Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2396,6 +2396,66 @@
   return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
 }
 
+static bool isLoadOrFreeCast(X86TTIImpl *TTI, const Value *V) {
+  if (isa<LoadInst>(V))
+    return true;
+  const auto *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+  switch (I->getOpcode()) {
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    // A cast of a load counts only if the cast operation itself is free.
+    return I->hasOneUse() &&
+           TTI->getCastInstrCost(I->getOpcode(), I->getType(),
+                                 I->getOperand(0)->getType(), I) == 0 &&
+           isa<LoadInst>(I->getOperand(0));
+  default:
+    break;
+  }
+  return false;
+}
+
+static bool isFreeOp(X86TTIImpl *TTI, const Instruction *I, const Type *Ty,
+                     const Instruction *OpI) {
+  switch (I->getOpcode()) {
+  case Instruction::Sub:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FSub:
+  case Instruction::FDiv:
+    return I->getOperand(1) == OpI;
+  case Instruction::Shl:
+  case Instruction::AShr:
+  case Instruction::LShr:
+    return I->getOperand(0) == OpI && Ty->getScalarSizeInBits() <= 64;
+  case Instruction::Add:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return !isLoadOrFreeCast(TTI, I->getOperand(0)) ||
+           !isLoadOrFreeCast(TTI, I->getOperand(1)) || I->getOperand(1) == OpI;
+  case Instruction::Mul:
+    // The load instruction is folded only if the size of the data is i8, i16,
+    // i32 or i64.
+    return (!isLoadOrFreeCast(TTI, I->getOperand(0)) ||
+            !isLoadOrFreeCast(TTI, I->getOperand(1)) ||
+            I->getOperand(1) == OpI) &&
+           Ty->getScalarSizeInBits() <= 64;
+  case Instruction::FAdd:
+  case Instruction::FMul:
+    return !isa<ConstantFP>(I->getOperand(0)) &&
+           !isa<ConstantFP>(I->getOperand(1)) &&
+           (!isLoadOrFreeCast(TTI, I->getOperand(0)) ||
+            !isLoadOrFreeCast(TTI, I->getOperand(1)) ||
+            I->getOperand(1) == OpI);
+  }
+  return false;
+}
+
 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                 unsigned AddressSpace, const Instruction *I) {
   // Handle non-power-of-two vectors such as <3 x float>
@@ -2421,6 +2481,27 @@
                                                Opcode == Instruction::Store);
       return NumElem * Cost + SplitCost;
     }
+  } else if (I && isa<LoadInst>(I) && I->hasOneUse()) {
+    // Check if the cost of the scalar load can be considered 0. It can, if the
+    // arithmetic/logic/compare/cast instruction that uses the loaded value can
+    // take a memory address as one of its arguments.
+    const Instruction *UI = I->user_back();
+    if (isFreeOp(this, UI, UI->getType(), I))
+      return 0;
+    switch (UI->getOpcode()) {
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      // The load is also free if its cast is free and the cast's user folds it.
+      if (UI->hasOneUse() &&
+          getCastInstrCost(UI->getOpcode(), UI->getType(),
+                           UI->getOperand(0)->getType(), UI) == 0 &&
+          isFreeOp(this, UI->user_back(), UI->getType(), UI))
+        return 0;
+      break;
+    default:
+      break;
+    }
   }
 
   // Legalize the type.
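A minimal IR sketch (hypothetical functions, not taken from the tests below) of what the X86 change above models: a single-use scalar load whose user can take the value straight from memory is now costed as free, while a load in an operand position that x86 cannot fold keeps its normal cost.

; The add can use a reg-mem form (e.g. addl (%rdi), %eax), so isFreeOp
; succeeds and getMemoryOpCost now returns 0 for the load.
define i32 @fold_load_into_add(i32* %p, i32 %x) {
entry:
  %v = load i32, i32* %p, align 4
  %r = add i32 %x, %v
  ret i32 %r
}

; Here the load is the first operand of the sub; per isFreeOp only the second
; operand of sub/div can come from memory, so the load keeps its cost of 1.
define i32 @no_fold_into_sub(i32* %p, i32 %x) {
entry:
  %v = load i32, i32* %p, align 4
  %r = sub i32 %v, %x
  ret i32 %r
}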
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3119,15 +3119,26 @@
     }
     case Instruction::Load: {
       // Cost of wide load - cost of scalar loads.
-      unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
-      int ScalarEltCost =
-          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
+      const auto *LI = cast<LoadInst>(VL0);
+      unsigned alignment = LI->getAlignment();
+      int ScalarLdCost = 0;
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        for (const unsigned I : E->ReuseShuffleIndices) {
+          const auto *LD = cast<LoadInst>(VL[I]);
+          ReuseShuffleCost -=
+              TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment,
+                                   LD->getPointerAddressSpace(), LD);
+        }
+      } else {
+        for (Value *V : VL) {
+          const auto *LD = cast<LoadInst>(V);
+          ScalarLdCost +=
+              TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment,
+                                   LD->getPointerAddressSpace(), LD);
+        }
       }
-      int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
-      int VecLdCost =
-          TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0);
+      int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment,
+                                           LI->getPointerAddressSpace());
       if (!E->ReorderIndices.empty()) {
         // TODO: Merge this shuffle with the ReuseShuffleCost.
         VecLdCost += TTI->getShuffleCost(
Index: test/Analysis/CostModel/X86/interleave-load-i32.ll
===================================================================
--- test/Analysis/CostModel/X86/interleave-load-i32.ll
+++ test/Analysis/CostModel/X86/interleave-load-i32.ll
@@ -10,7 +10,7 @@
 ; Function Attrs: nounwind uwtable
 define void @load_i32_interleave4() {
 ;CHECK-LABEL: load_i32_interleave4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load
+;CHECK: Found an estimated cost of 0 for VF 1 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 5 for VF 2 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 5 for VF 4 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %0 = load
@@ -46,7 +46,7 @@
 
 define void @load_i32_interleave5() {
 ;CHECK-LABEL: load_i32_interleave5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load
+;CHECK: Found an estimated cost of 0 for VF 1 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 6 for VF 2 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 9 for VF 4 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 18 for VF 8 For instruction: %0 = load
Index: test/Analysis/CostModel/X86/interleaved-load-i8.ll
===================================================================
--- test/Analysis/CostModel/X86/interleaved-load-i8.ll
+++ test/Analysis/CostModel/X86/interleaved-load-i8.ll
@@ -6,7 +6,7 @@
 
 ; Function Attrs: norecurse nounwind readonly uwtable
 define i32 @doit_stride3(i8* nocapture readonly %Ptr, i32 %Nels) {
-;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %0 = load i8
 ;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction: %0 = load i8
 ;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8
 ;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction: %0 = load i8
Index: test/Analysis/CostModel/X86/vectorized-loop.ll
===================================================================
--- test/Analysis/CostModel/X86/vectorized-loop.ll
+++ test/Analysis/CostModel/X86/vectorized-loop.ll
@@ -53,12 +53,12 @@
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %end.idx.rnd.down, %middle.block ]
   %13 = add nsw i64 %indvars.iv, 2
   %arrayidx = getelementptr inbounds i32, i32* %B, i64 %13
-  ;CHECK: cost of 1 {{.*}} load
+  ;CHECK:
cost of 0 {{.*}} load %14 = load i32, i32* %arrayidx, align 4 ;CHECK: cost of 1 {{.*}} mul %mul = mul nsw i32 %14, 5 %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv - ;CHECK: cost of 1 {{.*}} load + ;CHECK: cost of 0 {{.*}} load %15 = load i32, i32* %arrayidx2, align 4 %add3 = add nsw i32 %15, %mul store i32 %add3, i32* %arrayidx2, align 4 Index: test/Transforms/LoopVectorize/X86/gather_scatter.ll =================================================================== --- test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -234,135 +234,65 @@ define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 { ; AVX512-LABEL: @foo2( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) -; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer -; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) -; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) -; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> 
[[TMP11]]) -; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) -; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], -; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) -; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) -; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], -; AVX512-NEXT: [[TMP34:%.*]] = 
getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) -; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) -; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer -; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) -; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], -; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer -; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x 
float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) -; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer -; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) -; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], -; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer -; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) -; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], -; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) -; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer -; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) -; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], -; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) -; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer -; AVX512-NEXT: 
[[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) -; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], -; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) -; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer -; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) -; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], -; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) +; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 +; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], i64 [[INDVARS_IV]], i32 1 +; AVX512-NEXT: [[TMP1:%.*]] = load float, float* [[B]], align 4 +; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 16 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP2]], 0 +; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX512: for.end: ; AVX512-NEXT: ret void +; AVX512: if.then.1: +; AVX512-NEXT: [[B_1:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV_NEXT]], i32 1 +; AVX512-NEXT: [[TMP3:%.*]] = load float, float* [[B_1]], align 4 +; AVX512-NEXT: [[ADD_1:%.*]] = fadd float [[TMP3]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: store float [[ADD_1]], float* [[ARRAYIDX5_1]], align 4 +; AVX512-NEXT: br label [[FOR_INC_1]] +; AVX512: for.inc.1: +; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 32 +; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = 
getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP4]], 0 +; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX512: if.then.2: +; AVX512-NEXT: [[B_2:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV_NEXT_1]], i32 1 +; AVX512-NEXT: [[TMP5:%.*]] = load float, float* [[B_2]], align 4 +; AVX512-NEXT: [[ADD_2:%.*]] = fadd float [[TMP5]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: store float [[ADD_2]], float* [[ARRAYIDX5_2]], align 4 +; AVX512-NEXT: br label [[FOR_INC_2]] +; AVX512: for.inc.2: +; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 48 +; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP6]], 0 +; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX512: if.then.3: +; AVX512-NEXT: [[B_3:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV_NEXT_2]], i32 1 +; AVX512-NEXT: [[TMP7:%.*]] = load float, float* [[B_3]], align 4 +; AVX512-NEXT: [[ADD_3:%.*]] = fadd float [[TMP7]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: store float [[ADD_3]], float* [[ARRAYIDX5_3]], align 4 +; AVX512-NEXT: br label [[FOR_INC_3]] +; AVX512: for.inc.3: +; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 64 +; AVX512-NEXT: [[CMP_3:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT_3]], 4096 +; AVX512-NEXT: br i1 [[CMP_3]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; ; FVW2-LABEL: @foo2( ; FVW2-NEXT: entry: @@ -550,135 +480,65 @@ define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) { ; AVX512-LABEL: @foo3( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) -; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6]], -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer -; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: 
[[WIDE_MASKED_GATHER6_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) -; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_1]], -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) -; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_2]], -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) -; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) -; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_3]], -; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) -; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_4]], -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> 
[[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_5]], -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) -; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_6]], -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_7]], -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) -; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_8]], -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x 
float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) -; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer -; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) -; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_9]], -; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer -; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) -; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_10]], -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer -; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) -; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_11]], -; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer -; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], 
<16 x float> undef) -; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_12]], -; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) -; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer -; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) -; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_13]], -; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) -; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer -; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) -; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_14]], -; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) -; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer -; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) -; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_15]], -; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) +; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP0:%.*]] = load i32, 
i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 +; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], i64 [[INDVARS_IV]], i32 1 +; AVX512-NEXT: [[TMP1:%.*]] = load float, float* [[B]], align 4 +; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], 5.000000e-01 +; AVX512-NEXT: [[B6:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[INDVARS_IV]], i32 1 +; AVX512-NEXT: store float [[ADD]], float* [[B6]], align 4 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 16 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP2]], 0 +; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX512: for.end: ; AVX512-NEXT: ret void +; AVX512: if.then.1: +; AVX512-NEXT: [[B_1:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV_NEXT]], i32 1 +; AVX512-NEXT: [[TMP3:%.*]] = load float, float* [[B_1]], align 4 +; AVX512-NEXT: [[ADD_1:%.*]] = fadd float [[TMP3]], 5.000000e-01 +; AVX512-NEXT: [[B6_1:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[INDVARS_IV_NEXT]], i32 1 +; AVX512-NEXT: store float [[ADD_1]], float* [[B6_1]], align 4 +; AVX512-NEXT: br label [[FOR_INC_1]] +; AVX512: for.inc.1: +; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 32 +; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP4]], 0 +; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX512: if.then.2: +; AVX512-NEXT: [[B_2:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV_NEXT_1]], i32 1 +; AVX512-NEXT: [[TMP5:%.*]] = load float, float* [[B_2]], align 4 +; AVX512-NEXT: [[ADD_2:%.*]] = fadd float [[TMP5]], 5.000000e-01 +; AVX512-NEXT: [[B6_2:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[INDVARS_IV_NEXT_1]], i32 1 +; AVX512-NEXT: store float [[ADD_2]], float* [[B6_2]], align 4 +; AVX512-NEXT: br label [[FOR_INC_2]] +; AVX512: for.inc.2: +; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 48 +; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP6]], 0 +; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX512: if.then.3: +; AVX512-NEXT: [[B_3:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV_NEXT_2]], i32 1 +; AVX512-NEXT: [[TMP7:%.*]] = load float, float* [[B_3]], align 4 +; AVX512-NEXT: [[ADD_3:%.*]] = fadd float [[TMP7]], 5.000000e-01 +; AVX512-NEXT: [[B6_3:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[INDVARS_IV_NEXT_2]], i32 1 +; AVX512-NEXT: store float [[ADD_3]], float* [[B6_3]], align 4 +; AVX512-NEXT: br label [[FOR_INC_3]] +; AVX512: for.inc.3: +; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 64 +; AVX512-NEXT: [[CMP_3:%.*]] 
= icmp ult i64 [[INDVARS_IV_NEXT_3]], 4096 +; AVX512-NEXT: br i1 [[CMP_3]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; ; FVW2-LABEL: @foo3( ; FVW2-NEXT: entry: @@ -852,135 +712,65 @@ define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 { ; AVX512-LABEL: @foo2_addrspace( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) -; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer -; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) -; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) -; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) -; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, 
<16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) -; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], -; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) -; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) -; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float 
addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) -; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) -; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer -; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) -; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], -; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer -; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds 
[[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) -; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer -; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) -; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], -; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer -; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) -; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], -; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) -; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer -; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) -; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], -; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> 
[[TMP66]]) -; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer -; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) -; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], -; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) -; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer -; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) -; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], -; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) +; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 +; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], i64 [[INDVARS_IV]], i32 1 +; AVX512-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[B]], align 4 +; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store float [[ADD]], float addrspace(1)* [[ARRAYIDX5]], align 4 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 16 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP2]], 0 +; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX512: for.end: ; AVX512-NEXT: ret void +; AVX512: if.then.1: +; AVX512-NEXT: [[B_1:%.*]] = getelementptr inbounds [[STRUCT_IN]], 
[[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV_NEXT]], i32 1 +; AVX512-NEXT: [[TMP3:%.*]] = load float, float addrspace(1)* [[B_1]], align 4 +; AVX512-NEXT: [[ADD_1:%.*]] = fadd float [[TMP3]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: store float [[ADD_1]], float addrspace(1)* [[ARRAYIDX5_1]], align 4 +; AVX512-NEXT: br label [[FOR_INC_1]] +; AVX512: for.inc.1: +; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 32 +; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP4]], 0 +; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX512: if.then.2: +; AVX512-NEXT: [[B_2:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV_NEXT_1]], i32 1 +; AVX512-NEXT: [[TMP5:%.*]] = load float, float addrspace(1)* [[B_2]], align 4 +; AVX512-NEXT: [[ADD_2:%.*]] = fadd float [[TMP5]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: store float [[ADD_2]], float addrspace(1)* [[ARRAYIDX5_2]], align 4 +; AVX512-NEXT: br label [[FOR_INC_2]] +; AVX512: for.inc.2: +; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 48 +; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP6]], 0 +; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX512: if.then.3: +; AVX512-NEXT: [[B_3:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV_NEXT_2]], i32 1 +; AVX512-NEXT: [[TMP7:%.*]] = load float, float addrspace(1)* [[B_3]], align 4 +; AVX512-NEXT: [[ADD_3:%.*]] = fadd float [[TMP7]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: store float [[ADD_3]], float addrspace(1)* [[ARRAYIDX5_3]], align 4 +; AVX512-NEXT: br label [[FOR_INC_3]] +; AVX512: for.inc.3: +; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 64 +; AVX512-NEXT: [[CMP_3:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT_3]], 4096 +; AVX512-NEXT: br i1 [[CMP_3]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; ; FVW2-LABEL: @foo2_addrspace( ; FVW2-NEXT: entry: @@ -1154,135 +944,65 @@ define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) { ; AVX512-LABEL: @foo2_addrspace2( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> 
undef) -; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer -; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) -; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) -; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) -; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) -; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], -; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] 
= getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) -; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) -; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 
x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) -; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) -; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer -; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) -; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], -; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer -; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) -; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer -; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) -; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], -; 
AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer -; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) -; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], -; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) -; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer -; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) -; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], -; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) -; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer -; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) -; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], -; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) -; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer -; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] 
addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) -; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], -; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) +; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 +; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], i64 [[INDVARS_IV]], i32 1 +; AVX512-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[B]], align 4 +; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 16 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP2]], 0 +; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX512: for.end: ; AVX512-NEXT: ret void +; AVX512: if.then.1: +; AVX512-NEXT: [[B_1:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV_NEXT]], i32 1 +; AVX512-NEXT: [[TMP3:%.*]] = load float, float addrspace(1)* [[B_1]], align 4 +; AVX512-NEXT: [[ADD_1:%.*]] = fadd float [[TMP3]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: store float [[ADD_1]], float* [[ARRAYIDX5_1]], align 4 +; AVX512-NEXT: br label [[FOR_INC_1]] +; AVX512: for.inc.1: +; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 32 +; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP4]], 0 +; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX512: if.then.2: +; AVX512-NEXT: [[B_2:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV_NEXT_1]], i32 1 +; AVX512-NEXT: [[TMP5:%.*]] = load float, float addrspace(1)* [[B_2]], align 4 +; AVX512-NEXT: [[ADD_2:%.*]] = fadd float [[TMP5]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: store float [[ADD_2]], float* [[ARRAYIDX5_2]], align 4 +; AVX512-NEXT: br label [[FOR_INC_2]] +; AVX512: for.inc.2: +; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 48 +; AVX512-NEXT: 
[[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP6]], 0 +; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX512: if.then.3: +; AVX512-NEXT: [[B_3:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV_NEXT_2]], i32 1 +; AVX512-NEXT: [[TMP7:%.*]] = load float, float addrspace(1)* [[B_3]], align 4 +; AVX512-NEXT: [[ADD_3:%.*]] = fadd float [[TMP7]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: store float [[ADD_3]], float* [[ARRAYIDX5_3]], align 4 +; AVX512-NEXT: br label [[FOR_INC_3]] +; AVX512: for.inc.3: +; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 64 +; AVX512-NEXT: [[CMP_3:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT_3]], 4096 +; AVX512-NEXT: br i1 [[CMP_3]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; ; FVW2-LABEL: @foo2_addrspace2( ; FVW2-NEXT: entry: @@ -1456,135 +1176,65 @@ define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) { ; AVX512-LABEL: @foo2_addrspace3( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) -; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer -; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) -; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> 
[[WIDE_MASKED_GATHER_2]], zeroinitializer -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) -; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) -; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) -; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], -; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) -; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) -; AVX512-NEXT: 
[[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) -; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) -; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) -; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer -; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) -; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> 
[[WIDE_MASKED_GATHER7_9]], -; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer -; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) -; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer -; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) -; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], -; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer -; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) -; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], -; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) -; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer -; AVX512-NEXT: [[TMP67:%.*]] = 
getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) -; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], -; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) -; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer -; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) -; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], -; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) -; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer -; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) -; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], -; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) +; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 +; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], i64 [[INDVARS_IV]], i32 1 +; AVX512-NEXT: [[TMP1:%.*]] = load float, float* [[B]], align 4 +; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store float [[ADD]], float addrspace(1)* [[ARRAYIDX5]], align 4 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: 
[[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 16 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP2]], 0 +; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX512: for.end: ; AVX512-NEXT: ret void +; AVX512: if.then.1: +; AVX512-NEXT: [[B_1:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV_NEXT]], i32 1 +; AVX512-NEXT: [[TMP3:%.*]] = load float, float* [[B_1]], align 4 +; AVX512-NEXT: [[ADD_1:%.*]] = fadd float [[TMP3]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: store float [[ADD_1]], float addrspace(1)* [[ARRAYIDX5_1]], align 4 +; AVX512-NEXT: br label [[FOR_INC_1]] +; AVX512: for.inc.1: +; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 32 +; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP4]], 0 +; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX512: if.then.2: +; AVX512-NEXT: [[B_2:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV_NEXT_1]], i32 1 +; AVX512-NEXT: [[TMP5:%.*]] = load float, float* [[B_2]], align 4 +; AVX512-NEXT: [[ADD_2:%.*]] = fadd float [[TMP5]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: store float [[ADD_2]], float addrspace(1)* [[ARRAYIDX5_2]], align 4 +; AVX512-NEXT: br label [[FOR_INC_2]] +; AVX512: for.inc.2: +; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 48 +; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP6]], 0 +; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX512: if.then.3: +; AVX512-NEXT: [[B_3:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV_NEXT_2]], i32 1 +; AVX512-NEXT: [[TMP7:%.*]] = load float, float* [[B_3]], align 4 +; AVX512-NEXT: [[ADD_3:%.*]] = fadd float [[TMP7]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: store float [[ADD_3]], float addrspace(1)* [[ARRAYIDX5_3]], align 4 +; AVX512-NEXT: br label [[FOR_INC_3]] +; AVX512: for.inc.3: +; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 64 +; AVX512-NEXT: [[CMP_3:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT_3]], 4096 +; AVX512-NEXT: br i1 [[CMP_3]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; ; FVW2-LABEL: @foo2_addrspace3( ; FVW2-NEXT: entry: Index: test/Transforms/LoopVectorize/X86/masked_load_store.ll =================================================================== --- test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -1581,178 +1581,28 @@ ;} define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %size, i32* nocapture readonly %trigger) local_unnamed_addr #0 { -; AVX1-LABEL: @foo6( -; AVX1-NEXT: 
entry: -; AVX1-NEXT: br label [[FOR_BODY:%.*]] -; AVX1: for.body: -; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 4095, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 -; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; AVX1: if.then: -; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN:%.*]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX3]], align 8 -; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e-01 -; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDVARS_IV]] -; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 -; AVX1-NEXT: br label [[FOR_INC]] -; AVX1: for.inc: -; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; AVX1-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX1-NEXT: br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; AVX1: for.end: -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @foo6( -; AVX2-NEXT: entry: -; AVX2-NEXT: [[OUT1:%.*]] = bitcast double* [[OUT:%.*]] to i8* -; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* -; AVX2-NEXT: [[IN6:%.*]] = bitcast double* [[IN:%.*]] to i8* -; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; AVX2: vector.memcheck: -; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT]], i64 4096 -; AVX2-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8* -; AVX2-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 4096 -; AVX2-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* -; AVX2-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[IN]], i64 4096 -; AVX2-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8* -; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP45]] -; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] -; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX2-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP78]] -; AVX2-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[IN6]], [[SCEVGEP2]] -; AVX2-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] -; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] -; AVX2-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true -; AVX2-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; AVX2: vector.ph: -; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] -; AVX2: vector.body: -; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] -; AVX2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0 -; AVX2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION12:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION13:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION14:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 -; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -4 -; AVX2-NEXT: [[TMP2:%.*]] = add 
i64 [[OFFSET_IDX]], -8 -; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -12 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3 -; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41 -; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -4 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3 -; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 -; AVX2-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD15]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8 -; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -3 -; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !41 -; AVX2-NEXT: [[REVERSE18:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD17]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -12 -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -3 -; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4, !alias.scope !41 -; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD19]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP20:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer -; AVX2-NEXT: [[TMP21:%.*]] = icmp sgt <4 x i32> [[REVERSE16]], zeroinitializer -; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt <4 x i32> [[REVERSE18]], zeroinitializer -; AVX2-NEXT: [[TMP23:%.*]] = icmp sgt <4 x i32> [[REVERSE20]], zeroinitializer -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 0 -; AVX2-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, double* [[TMP28]], i32 -3 -; AVX2-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i1> [[TMP20]], <4 x i1> undef, <4 x i32> -; AVX2-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE21]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> undef, <4 x i32> -; AVX2-NEXT: [[TMP31:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -4 -; 
AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP31]], i32 -3 -; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i1> [[TMP21]], <4 x i1> undef, <4 x i32> -; AVX2-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP33]], i32 8, <4 x i1> [[REVERSE23]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD24]], <4 x double> undef, <4 x i32> -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -8 -; AVX2-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i32 -3 -; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP22]], <4 x i1> undef, <4 x i32> -; AVX2-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD27]], <4 x double> undef, <4 x i32> -; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -12 -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i32 -3 -; AVX2-NEXT: [[REVERSE29:%.*]] = shufflevector <4 x i1> [[TMP23]], <4 x i1> undef, <4 x i32> -; AVX2-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD30:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE29]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD30]], <4 x double> undef, <4 x i32> -; AVX2-NEXT: [[TMP40:%.*]] = fadd <4 x double> [[REVERSE22]], -; AVX2-NEXT: [[TMP41:%.*]] = fadd <4 x double> [[REVERSE25]], -; AVX2-NEXT: [[TMP42:%.*]] = fadd <4 x double> [[REVERSE28]], -; AVX2-NEXT: [[TMP43:%.*]] = fadd <4 x double> [[REVERSE31]], -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]] -; AVX2-NEXT: [[REVERSE32:%.*]] = shufflevector <4 x double> [[TMP40]], <4 x double> undef, <4 x i32> -; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 -; AVX2-NEXT: [[TMP49:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 -3 -; AVX2-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE32]], <4 x double>* [[TMP50]], i32 8, <4 x i1> [[REVERSE21]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x double> [[TMP41]], <4 x double> undef, <4 x i32> -; AVX2-NEXT: [[TMP51:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -4 -; AVX2-NEXT: [[TMP52:%.*]] = getelementptr inbounds double, double* [[TMP51]], i32 -3 -; AVX2-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE34]], <4 x double>* [[TMP53]], i32 8, <4 x i1> [[REVERSE23]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[REVERSE36:%.*]] = shufflevector <4 x 
double> [[TMP42]], <4 x double> undef, <4 x i32> -; AVX2-NEXT: [[TMP54:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -8 -; AVX2-NEXT: [[TMP55:%.*]] = getelementptr inbounds double, double* [[TMP54]], i32 -3 -; AVX2-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE36]], <4 x double>* [[TMP56]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[REVERSE38:%.*]] = shufflevector <4 x double> [[TMP43]], <4 x double> undef, <4 x i32> -; AVX2-NEXT: [[TMP57:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -12 -; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP57]], i32 -3 -; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE38]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE29]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX2-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 -; AVX2: middle.block: -; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 -; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] -; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ] -; AVX2-NEXT: br label [[FOR_BODY:%.*]] -; AVX2: for.body: -; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP61:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP61]], 0 -; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; AVX2: if.then: -; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP62:%.*]] = load double, double* [[ARRAYIDX3]], align 8 -; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP62]], 5.000000e-01 -; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] -; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 -; AVX2-NEXT: br label [[FOR_INC]] -; AVX2: for.inc: -; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; AVX2-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50 -; AVX2: for.end: -; AVX2-NEXT: ret void +; AVX-LABEL: @foo6( +; AVX-NEXT: entry: +; AVX-NEXT: br label [[FOR_BODY:%.*]] +; AVX: for.body: +; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 4095, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 +; AVX-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; AVX: if.then: +; AVX-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN:%.*]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e-01 +; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 
[[INDVARS_IV]] +; AVX-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 +; AVX-NEXT: br label [[FOR_INC]] +; AVX: for.inc: +; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; AVX-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 +; AVX-NEXT: br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; AVX: for.end: +; AVX-NEXT: ret void ; ; AVX512-LABEL: @foo6( ; AVX512-NEXT: entry: @@ -1940,265 +1790,37 @@ ; } define void @foo7(double* noalias nocapture %out, double** noalias nocapture readonly %in, i8* noalias nocapture readonly %trigger, i32 %size) local_unnamed_addr #0 { -; AVX1-LABEL: @foo7( -; AVX1-NEXT: entry: -; AVX1-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 -; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX1: for.body.preheader: -; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 -; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 -; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; AVX1: vector.ph: -; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 -; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] -; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] -; AVX1: vector.body: -; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; AVX1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; AVX1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX1-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX1-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX1-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 -; AVX1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 -; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4 -; AVX1-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 -; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 -; AVX1-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 -; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12 -; AVX1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 -; AVX1-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD4]], -; AVX1-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD5]], -; AVX1-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD6]], -; 
AVX1-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer -; AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer -; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer -; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer -; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], -; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], -; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], -; AVX1-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], -; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 0 -; AVX1-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x double*> undef) -; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 4 -; AVX1-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> undef) -; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 8 -; AVX1-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> undef) -; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 12 -; AVX1-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> undef) -; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD7]], zeroinitializer -; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD8]], zeroinitializer -; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD9]], zeroinitializer -; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], -; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], -; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], -; AVX1-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], -; AVX1-NEXT: [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]] -; AVX1-NEXT: [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]] -; AVX1-NEXT: [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]] -; AVX1-NEXT: [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]] -; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 -; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* 
[[TMP56]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) -; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4 -; AVX1-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]]) -; AVX1-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8 -; AVX1-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]]) -; AVX1-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12 -; AVX1-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) -; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !41 -; AVX1: middle.block: -; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; AVX1-NEXT: br label [[FOR_BODY:%.*]] -; AVX1: for.body: -; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX1-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1 -; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0 -; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] -; AVX1: land.lhs.true: -; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 -; AVX1-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP67]], null -; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] -; AVX1: if.then: -; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] -; AVX1-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 -; AVX1-NEXT: br label [[FOR_INC]] -; AVX1: for.inc: -; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !42 -; AVX1: for.end.loopexit: -; AVX1-NEXT: br label [[FOR_END]] -; AVX1: for.end: -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @foo7( -; AVX2-NEXT: entry: -; AVX2-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 -; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX2: for.body.preheader: -; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 -; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 -; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; AVX2: vector.ph: -; AVX2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 -; AVX2-NEXT: [[N_VEC:%.*]] 
= sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] -; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] -; AVX2: vector.body: -; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; AVX2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 -; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4 -; AVX2-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 -; AVX2-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12 -; AVX2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 -; AVX2-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD4]], -; AVX2-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD5]], -; AVX2-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD6]], -; AVX2-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer -; AVX2-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer -; AVX2-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer -; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], -; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], -; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], -; AVX2-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 0 -; AVX2-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], 
i32 8, <4 x i1> [[TMP28]], <4 x double*> undef) -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 4 -; AVX2-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> undef) -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 8 -; AVX2-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> undef) -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 12 -; AVX2-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> undef) -; AVX2-NEXT: [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX2-NEXT: [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD7]], zeroinitializer -; AVX2-NEXT: [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD8]], zeroinitializer -; AVX2-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD9]], zeroinitializer -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], -; AVX2-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], -; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], -; AVX2-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], -; AVX2-NEXT: [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]] -; AVX2-NEXT: [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]] -; AVX2-NEXT: [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]] -; AVX2-NEXT: [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]] -; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 -; AVX2-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) -; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4 -; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]]) -; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8 -; AVX2-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]]) -; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12 -; AVX2-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) -; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP64]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 -; AVX2: middle.block: -; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; AVX2-NEXT: br label [[FOR_BODY:%.*]] -; AVX2: for.body: -; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX2-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1 -; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0 -; AVX2-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] -; AVX2: land.lhs.true: -; AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 -; AVX2-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP67]], null -; AVX2-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] -; AVX2: if.then: -; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] -; AVX2-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 -; AVX2-NEXT: br label [[FOR_INC]] -; AVX2: for.inc: -; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !52 -; AVX2: for.end.loopexit: -; AVX2-NEXT: br label [[FOR_END]] -; AVX2: for.end: -; AVX2-NEXT: ret void +; AVX-LABEL: @foo7( +; AVX-NEXT: entry: +; AVX-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 +; AVX-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; AVX: for.body.preheader: +; AVX-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX-NEXT: br label [[FOR_BODY:%.*]] +; AVX: for.body: +; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], 1 +; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP1]], 0 +; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] +; AVX: land.lhs.true: +; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP2:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 +; AVX-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP2]], null +; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; AVX: if.then: +; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDVARS_IV]] +; AVX-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 +; AVX-NEXT: br label [[FOR_INC]] +; AVX: for.inc: +; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] +; AVX: for.end.loopexit: +; AVX-NEXT: br label [[FOR_END]] +; AVX: for.end: +; AVX-NEXT: ret void ; ; AVX512-LABEL: 
@foo7( ; AVX512-NEXT: entry: @@ -2375,265 +1997,37 @@ ;} define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture readonly %in, i8* noalias nocapture readonly %trigger, i32 %size) local_unnamed_addr #0 { -; AVX1-LABEL: @foo8( -; AVX1-NEXT: entry: -; AVX1-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 -; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX1: for.body.preheader: -; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 -; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 -; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; AVX1: vector.ph: -; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 -; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] -; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] -; AVX1: vector.body: -; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; AVX1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; AVX1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX1-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX1-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX1-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 -; AVX1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 -; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4 -; AVX1-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 -; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 -; AVX1-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 -; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12 -; AVX1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 -; AVX1-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD4]], -; AVX1-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD5]], -; AVX1-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD6]], -; AVX1-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer -; AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer -; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer -; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer -; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 
[[TMP0]] -; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], -; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], -; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], -; AVX1-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], -; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 0 -; AVX1-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> undef) -; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 4 -; AVX1-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> undef) -; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 8 -; AVX1-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> undef) -; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 12 -; AVX1-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> undef) -; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD7]], zeroinitializer -; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD8]], zeroinitializer -; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD9]], zeroinitializer -; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], -; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], -; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], -; AVX1-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], -; AVX1-NEXT: [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]] -; AVX1-NEXT: [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]] -; AVX1-NEXT: [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]] -; AVX1-NEXT: [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]] -; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 -; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) -; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4 -; AVX1-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX1-NEXT: call void 
@llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]]) -; AVX1-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8 -; AVX1-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]]) -; AVX1-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12 -; AVX1-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) -; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !44 -; AVX1: middle.block: -; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; AVX1-NEXT: br label [[FOR_BODY:%.*]] -; AVX1: for.body: -; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX1-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1 -; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0 -; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] -; AVX1: land.lhs.true: -; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 -; AVX1-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null -; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] -; AVX1: if.then: -; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] -; AVX1-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 -; AVX1-NEXT: br label [[FOR_INC]] -; AVX1: for.inc: -; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !45 -; AVX1: for.end.loopexit: -; AVX1-NEXT: br label [[FOR_END]] -; AVX1: for.end: -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @foo8( -; AVX2-NEXT: entry: -; AVX2-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 -; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX2: for.body.preheader: -; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 -; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 -; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; AVX2: vector.ph: -; AVX2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 -; AVX2-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] -; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] -; AVX2: vector.body: -; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; AVX2-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 -; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4 -; AVX2-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 -; AVX2-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12 -; AVX2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 -; AVX2-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD4]], -; AVX2-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD5]], -; AVX2-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD6]], -; AVX2-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer -; AVX2-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer -; AVX2-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer -; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], -; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], -; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], -; AVX2-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 0 -; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> undef) -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 4 -; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 
x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> undef) -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 8 -; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> undef) -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 12 -; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> undef) -; AVX2-NEXT: [[TMP40:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX2-NEXT: [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD7]], zeroinitializer -; AVX2-NEXT: [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD8]], zeroinitializer -; AVX2-NEXT: [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD9]], zeroinitializer -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], -; AVX2-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], -; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], -; AVX2-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], -; AVX2-NEXT: [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]] -; AVX2-NEXT: [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]] -; AVX2-NEXT: [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]] -; AVX2-NEXT: [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]] -; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 -; AVX2-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) -; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4 -; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]]) -; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8 -; AVX2-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]]) -; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12 -; AVX2-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) -; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54 -; AVX2: middle.block: -; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 
[[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; AVX2-NEXT: br label [[FOR_BODY:%.*]] -; AVX2: for.body: -; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX2-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1 -; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0 -; AVX2-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] -; AVX2: land.lhs.true: -; AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 -; AVX2-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null -; AVX2-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] -; AVX2: if.then: -; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] -; AVX2-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 -; AVX2-NEXT: br label [[FOR_INC]] -; AVX2: for.inc: -; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !55 -; AVX2: for.end.loopexit: -; AVX2-NEXT: br label [[FOR_END]] -; AVX2: for.end: -; AVX2-NEXT: ret void +; AVX-LABEL: @foo8( +; AVX-NEXT: entry: +; AVX-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 +; AVX-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; AVX: for.body.preheader: +; AVX-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX-NEXT: br label [[FOR_BODY:%.*]] +; AVX: for.body: +; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], 1 +; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP1]], 0 +; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] +; AVX: land.lhs.true: +; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP2:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 +; AVX-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP2]], null +; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; AVX: if.then: +; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDVARS_IV]] +; AVX-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 +; AVX-NEXT: br label [[FOR_INC]] +; AVX: for.inc: +; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] +; AVX: for.end.loopexit: +; AVX-NEXT: br label [[FOR_END]] +; AVX: for.end: +; AVX-NEXT: ret void ; ; AVX512-LABEL: @foo8( ; AVX512-NEXT: entry: Index: test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll =================================================================== --- test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -31,14 +31,14 
@@ ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0 ; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4 +; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !1 @@ -92,33 +92,18 @@ define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture readonly %B) { ; CHECK-LABEL: @vectorized1( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[INDUCTION]], -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6 -; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP2]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP10]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7 -; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], [[TMP1]] +; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !7 ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -163,17 +148,17 @@ ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !6 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !6 ; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4 +; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !6 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !9 ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -190,7 +175,7 @@ ; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !11 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10 ; CHECK: for.end: ; CHECK-NEXT: ret void ; Index: test/Transforms/SLPVectorizer/X86/PR35777.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/PR35777.ll +++ test/Transforms/SLPVectorizer/X86/PR35777.ll @@ -6,20 +6,24 @@ define { i64, i64 } 
@patatino(double %arg) { ; CHECK-LABEL: @patatino( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, <2 x double>* bitcast ([6 x double]* @global to <2 x double>*), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2) to <2 x double>*), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[ARG]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP0]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16 -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP11]], 1 +; CHECK-NEXT: [[TMP:%.*]] = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 0), align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fmul double [[TMP1]], [[ARG:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd double [[TMP]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4), align 16 +; CHECK-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = fptosi double [[TMP5]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 1), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 3), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = fmul double [[TMP9]], [[ARG]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd double [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 5), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = fadd double [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = fptosi double [[TMP13]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP7]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP15]], 1 ; CHECK-NEXT: ret { i64, i64 } [[TMP17]] ; bb: Index: test/Transforms/SLPVectorizer/X86/arith-add.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-add.ll +++ test/Transforms/SLPVectorizer/X86/arith-add.ll @@ -41,22 +41,38 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( -; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x 
i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* 
@b64, i32 0, i64 7), align 8 +; SLM-NEXT: [[R0:%.*]] = add i64 [[A0]], [[B0]] +; SLM-NEXT: [[R1:%.*]] = add i64 [[A1]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = add i64 [[A2]], [[B2]] +; SLM-NEXT: [[R3:%.*]] = add i64 [[A3]], [[B3]] +; SLM-NEXT: [[R4:%.*]] = add i64 [[A4]], [[B4]] +; SLM-NEXT: [[R5:%.*]] = add i64 [[A5]], [[B5]] +; SLM-NEXT: [[R6:%.*]] = add i64 [[A6]], [[B6]] +; SLM-NEXT: [[R7:%.*]] = add i64 [[A7]], [[B7]] +; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v8i64( Index: test/Transforms/SLPVectorizer/X86/arith-mul.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-mul.ll +++ test/Transforms/SLPVectorizer/X86/arith-mul.ll @@ -200,22 +200,70 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @mul_v16i32( -; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast 
(i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 +; SLM-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 +; SLM-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 +; SLM-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 +; SLM-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 +; SLM-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 +; SLM-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 +; SLM-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 +; SLM-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 +; SLM-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 +; SLM-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 +; SLM-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 +; SLM-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 +; SLM-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 +; SLM-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 +; SLM-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 +; SLM-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 +; SLM-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 +; SLM-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 +; SLM-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 +; SLM-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 +; SLM-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 +; SLM-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 +; SLM-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 +; SLM-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 +; SLM-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 +; SLM-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 +; SLM-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 +; SLM-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 +; SLM-NEXT: 
[[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 +; SLM-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 +; SLM-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 +; SLM-NEXT: [[R0:%.*]] = mul i32 [[A0]], [[B0]] +; SLM-NEXT: [[R1:%.*]] = mul i32 [[A1]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = mul i32 [[A2]], [[B2]] +; SLM-NEXT: [[R3:%.*]] = mul i32 [[A3]], [[B3]] +; SLM-NEXT: [[R4:%.*]] = mul i32 [[A4]], [[B4]] +; SLM-NEXT: [[R5:%.*]] = mul i32 [[A5]], [[B5]] +; SLM-NEXT: [[R6:%.*]] = mul i32 [[A6]], [[B6]] +; SLM-NEXT: [[R7:%.*]] = mul i32 [[A7]], [[B7]] +; SLM-NEXT: [[R8:%.*]] = mul i32 [[A8]], [[B8]] +; SLM-NEXT: [[R9:%.*]] = mul i32 [[A9]], [[B9]] +; SLM-NEXT: [[R10:%.*]] = mul i32 [[A10]], [[B10]] +; SLM-NEXT: [[R11:%.*]] = mul i32 [[A11]], [[B11]] +; SLM-NEXT: [[R12:%.*]] = mul i32 [[A12]], [[B12]] +; SLM-NEXT: [[R13:%.*]] = mul i32 [[A13]], [[B13]] +; SLM-NEXT: [[R14:%.*]] = mul i32 [[A14]], [[B14]] +; SLM-NEXT: [[R15:%.*]] = mul i32 [[A15]], [[B15]] +; SLM-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 +; SLM-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 +; SLM-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 +; SLM-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 +; SLM-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 +; SLM-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 +; SLM-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 +; SLM-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 +; SLM-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 +; SLM-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 +; SLM-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 +; SLM-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 +; SLM-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 +; SLM-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 +; SLM-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 +; SLM-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @mul_v16i32( Index: test/Transforms/SLPVectorizer/X86/arith-sub.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-sub.ll +++ test/Transforms/SLPVectorizer/X86/arith-sub.ll @@ -41,22 +41,38 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v8i64( -; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 
x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* 
@b64, i32 0, i64 7), align 8 +; SLM-NEXT: [[R0:%.*]] = sub i64 [[A0]], [[B0]] +; SLM-NEXT: [[R1:%.*]] = sub i64 [[A1]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = sub i64 [[A2]], [[B2]] +; SLM-NEXT: [[R3:%.*]] = sub i64 [[A3]], [[B3]] +; SLM-NEXT: [[R4:%.*]] = sub i64 [[A4]], [[B4]] +; SLM-NEXT: [[R5:%.*]] = sub i64 [[A5]], [[B5]] +; SLM-NEXT: [[R6:%.*]] = sub i64 [[A6]], [[B6]] +; SLM-NEXT: [[R7:%.*]] = sub i64 [[A7]], [[B7]] +; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v8i64( Index: test/Transforms/SLPVectorizer/X86/compare-reduce.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/compare-reduce.ll +++ test/Transforms/SLPVectorizer/X86/compare-reduce.ll @@ -10,21 +10,22 @@ ; CHECK-LABEL: @reduce_compare( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[CONV]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[CONV]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[CMP11:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[MUL1:%.*]] = fmul double [[CONV]], [[TMP1]] +; CHECK-NEXT: [[MUL2:%.*]] = fmul double [[MUL1]], 7.000000e+00 +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL2]], 5.000000e+00 +; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[ARRAYIDX6]], align 8 +; CHECK-NEXT: [[MUL8:%.*]] = fmul double [[CONV]], [[TMP3]] +; CHECK-NEXT: [[MUL9:%.*]] = fmul double [[MUL8]], 4.000000e+00 +; 
CHECK-NEXT: [[ADD10:%.*]] = fadd double [[MUL9]], 9.000000e+00 +; CHECK-NEXT: [[CMP11:%.*]] = fcmp ogt double [[ADD]], [[ADD10]] ; CHECK-NEXT: br i1 [[CMP11]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i64 0, i64 0)) Index: test/Transforms/SLPVectorizer/X86/consecutive-access.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/consecutive-access.ll +++ test/Transforms/SLPVectorizer/X86/consecutive-access.ll @@ -501,22 +501,23 @@ ; CHECK-NEXT: [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 -; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[X_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[Y_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[X_0_LCSSA]], [[Y_0_LCSSA]] ; CHECK-NEXT: ret double [[MUL]] ; CHECK: for.body: ; CHECK-NEXT: [[I_018:%.*]] = phi i32 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x double> [ [[TMP6]], [[FOR_BODY]] ], [ zeroinitializer, [[ENTRY]] ] +; CHECK-NEXT: [[Y_017:%.*]] = phi double [ [[ADD4]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-NEXT: [[X_016:%.*]] = phi double [ [[ADD]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ] ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_018]] to i64 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ADD]] = fadd double [[X_016]], [[TMP0]] ; CHECK-NEXT: [[ADD1:%.*]] = or i32 [[I_018]], 1 ; CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[ADD1]] to i64 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6]] = fadd <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; CHECK-NEXT: [[ADD4]] = fadd double [[Y_017]], [[TMP1]] ; CHECK-NEXT: [[ADD5]] = add i32 [[I_018]], 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD5]], [[N]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] Index: test/Transforms/SLPVectorizer/X86/cse.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/cse.ll +++ test/Transforms/SLPVectorizer/X86/cse.ll @@ -254,28 +254,27 @@ define i32 @partial_mrg(double* nocapture %A, i32 %n) { ; CHECK-LABEL: @partial_mrg( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[A:%.*]], align 8 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* 
[[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[CONV]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[CONV]], [[TMP0]] +; CHECK-NEXT: store double [[MUL]], double* [[A]], align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[MUL4:%.*]] = fmul double [[CONV]], [[TMP1]] +; CHECK-NEXT: store double [[MUL4]], double* [[ARRAYIDX2]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[N]], 4 ; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX7]], align 8 +; CHECK-NEXT: [[MUL9:%.*]] = fmul double [[CONV]], [[TMP2]] +; CHECK-NEXT: store double [[MUL9]], double* [[ARRAYIDX7]], align 8 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds double, double* [[A]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[ARRAYIDX11]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[N]], 4 ; CHECK-NEXT: [[CONV12:%.*]] = sitofp i32 [[ADD]] to double -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV12]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[MUL13:%.*]] = fmul double [[CONV12]], [[TMP3]] +; CHECK-NEXT: store double [[MUL13]], double* [[ARRAYIDX11]], align 8 ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: ; CHECK-NEXT: ret i32 0 Index: test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll +++ test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -37,25 +37,39 @@ define void @fn2(i32* %a, i32* %b, float* %c) { ; CHECK-LABEL: @fn2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 1 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 1 +; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[I1:%.*]] = load i32, i32* [[B:%.*]], align 4 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[I0]], [[I1]] +; CHECK-NEXT: [[FP1:%.*]] = sitofp i32 [[ADD1]] to float +; CHECK-NEXT: [[CALL1:%.*]] = tail call float @llvm.powi.f32(float [[FP1]], i32 [[ADD1]]) #2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 1 +; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 1 +; CHECK-NEXT: [[I3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[I2]], [[I3]] +; CHECK-NEXT: [[FP2:%.*]] = sitofp i32 [[ADD2]] to float +; CHECK-NEXT: [[CALL2:%.*]] = tail call float @llvm.powi.f32(float [[FP2]], i32 
[[ADD1]]) #2 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 2 +; CHECK-NEXT: [[I4:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 2 +; CHECK-NEXT: [[I5:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[I4]], [[I5]] +; CHECK-NEXT: [[FP3:%.*]] = sitofp i32 [[ADD3]] to float +; CHECK-NEXT: [[CALL3:%.*]] = tail call float @llvm.powi.f32(float [[FP3]], i32 [[ADD1]]) #2 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[I6:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.powi.v4f32(<4 x float> [[TMP5]], i32 [[TMP6]]) -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1 +; CHECK-NEXT: [[I7:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[I6]], [[I7]] +; CHECK-NEXT: [[FP4:%.*]] = sitofp i32 [[ADD4]] to float +; CHECK-NEXT: [[CALL4:%.*]] = tail call float @llvm.powi.f32(float [[FP4]], i32 [[ADD1]]) #2 +; CHECK-NEXT: store float [[CALL1]], float* [[C:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C]], i32 1 +; CHECK-NEXT: store float [[CALL2]], float* [[ARRAYIDX8]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2 +; CHECK-NEXT: store float [[CALL3]], float* [[ARRAYIDX9]], align 4 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[C]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 +; CHECK-NEXT: store float [[CALL4]], float* [[ARRAYIDX10]], align 4 ; CHECK-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/horizontal-list.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -13,25 +13,27 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]] -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 
-; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]] +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16 +; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4 +; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8 +; CHECK-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]] ; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]] -; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]] -; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]] -; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]] -; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]] +; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL4]], [[ADD7]] +; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD19]] +; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD19_1]] +; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD19_2]] ; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4 ; CHECK-NEXT: ret float [[ADD19_3]] ; @@ -1060,15 +1062,17 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4 ; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[TMP0]] ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP2]], [[ADD_1]] ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP3]], [[ADD_2]] ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 +; CHECK-NEXT: 
[[TMP4:%.*]] = load float, float* [[ARRAYIDX_4]], align 4 +; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float [[TMP4]], [[ADD_3]] ; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] -; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] -; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]] -; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX_5]], align 4 +; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float [[TMP5]], [[ADD_4]] ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8 ; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9 @@ -1077,8 +1081,8 @@ ; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12 ; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 13 ; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>* -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4 ; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]] ; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]] ; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]] @@ -1103,8 +1107,8 @@ ; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28 ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* -; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* +; CHECK-NEXT: [[TMP9:%.*]] = load <16 x float>, <16 x float>* [[TMP8]], align 4 ; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]] ; CHECK-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]] ; CHECK-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]] @@ -1120,33 +1124,31 @@ ; CHECK-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]] ; CHECK-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]] ; CHECK-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> undef, <16 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP7]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> undef, <16 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP9]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; 
CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 -; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP5]], [[RDX_SHUF7]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 +; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP7]], [[RDX_SHUF7]] ; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <8 x float> [[BIN_RDX8]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX10:%.*]] = fadd fast <8 x float> [[BIN_RDX8]], [[RDX_SHUF9]] ; CHECK-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0 -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF13]] -; CHECK-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <4 x float> [[BIN_RDX14]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0 -; CHECK-NEXT: [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0 +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[OP_RDX]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd fast float [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd fast float [[TMP13]], [[TMP3]] +; CHECK-NEXT: [[TMP15:%.*]] = fadd fast float [[TMP14]], [[TMP2]] +; CHECK-NEXT: [[TMP16:%.*]] = fadd fast float [[TMP15]], [[TMP1]] +; CHECK-NEXT: [[TMP17:%.*]] = fadd fast float [[TMP16]], [[TMP0]] ; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]] -; CHECK-NEXT: ret float [[TMP12]] +; CHECK-NEXT: ret float [[TMP17]] ; ; THRESHOLD-LABEL: @loadadd31( ; THRESHOLD-NEXT: entry: Index: test/Transforms/SLPVectorizer/X86/horizontal.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal.ll +++ test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -948,23 +948,22 @@ ; STORE-NEXT: [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; STORE-NEXT: br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; STORE: for.body.lr.ph: -; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 -; STORE-NEXT: [[TMP0:%.*]] = bitcast double* [[B]] to <2 x double>* -; STORE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; STORE-NEXT: [[TMP0:%.*]] = load double, double* [[B:%.*]], align 8 +; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 +; STORE-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX4]], align 8 ; STORE-NEXT: [[TMP2:%.*]] = sext 
i32 [[N]] to i64 ; STORE-NEXT: br label [[FOR_BODY:%.*]] ; STORE: for.body: ; STORE-NEXT: [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; STORE-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_018]], 2 ; STORE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[MUL]] +; STORE-NEXT: [[TMP3:%.*]] = load double, double* [[ARRAYIDX2]], align 8 +; STORE-NEXT: [[MUL3:%.*]] = fmul fast double [[TMP0]], [[TMP3]] ; STORE-NEXT: [[ADD16:%.*]] = or i64 [[MUL]], 1 ; STORE-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[ADD16]] -; STORE-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>* -; STORE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 -; STORE-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP4]] -; STORE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -; STORE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 -; STORE-NEXT: [[ADD8:%.*]] = fadd fast double [[TMP6]], [[TMP7]] +; STORE-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX6]], align 8 +; STORE-NEXT: [[MUL7:%.*]] = fmul fast double [[TMP1]], [[TMP4]] +; STORE-NEXT: [[ADD8:%.*]] = fadd fast double [[MUL3]], [[MUL7]] ; STORE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]] ; STORE-NEXT: store double [[ADD8]], double* [[ARRAYIDX9]], align 8 ; STORE-NEXT: [[INC]] = add nsw i64 [[I_018]], 1 @@ -1168,16 +1167,14 @@ ; ; STORE-LABEL: @float_red_example4( ; STORE-NEXT: entry: -; STORE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16 -; STORE-NEXT: [[ADD:%.*]] = fadd fast float undef, undef -; STORE-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]] -; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> -; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP0]], [[RDX_SHUF]] -; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> -; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] -; STORE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; STORE-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] -; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16 +; STORE-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16 +; STORE-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4 +; STORE-NEXT: [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]] +; STORE-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8 +; STORE-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; STORE-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4 +; STORE-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]] +; STORE-NEXT: store float [[ADD_2]], float* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; entry: @@ -1371,16 +1368,14 @@ ; ; STORE-LABEL: @i32_red_example4( ; STORE-NEXT: entry: -; STORE-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16 -; STORE-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef -; STORE-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]] -; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector 
<4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> -; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[TMP0]], [[RDX_SHUF]] -; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> -; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; STORE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 -; STORE-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]] -; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16 +; STORE-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16 +; STORE-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4 +; STORE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] +; STORE-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8 +; STORE-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]] +; STORE-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4 +; STORE-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]] +; STORE-NEXT: store i32 [[ADD_2]], i32* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/in-tree-user.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/in-tree-user.ll +++ test/Transforms/SLPVectorizer/X86/in-tree-user.ll @@ -11,22 +11,23 @@ ; CHECK-LABEL: @in_tree_user( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[CONV]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[CONV]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[INTREEUSER:%.*]] = fadd double [[TMP8]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[CMP11:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[MUL1:%.*]] = fmul double [[CONV]], [[TMP1]] +; CHECK-NEXT: [[MUL2:%.*]] = fmul double [[MUL1]], 7.000000e+00 +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL2]], 5.000000e+00 +; CHECK-NEXT: [[INTREEUSER:%.*]] = fadd double [[ADD]], [[ADD]] +; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[ARRAYIDX6]], align 8 +; CHECK-NEXT: [[MUL8:%.*]] = fmul double [[CONV]], [[TMP3]] +; CHECK-NEXT: 
[[MUL9:%.*]] = fmul double [[MUL8]], 4.000000e+00 +; CHECK-NEXT: [[ADD10:%.*]] = fadd double [[MUL9]], 9.000000e+00 +; CHECK-NEXT: [[CMP11:%.*]] = fcmp ogt double [[ADD]], [[ADD10]] ; CHECK-NEXT: br i1 [[CMP11]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i64 0, i64 0)) Index: test/Transforms/SLPVectorizer/X86/insertvalue.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/insertvalue.ll +++ test/Transforms/SLPVectorizer/X86/insertvalue.ll @@ -5,23 +5,23 @@ ; CHECK-LABEL: @julia_2xdouble( ; CHECK-NEXT: top: ; CHECK-NEXT: [[PX0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[X0:%.*]] = load double, double* [[PX0]], align 4 ; CHECK-NEXT: [[PY0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[Y0:%.*]] = load double, double* [[PY0]], align 4 +; CHECK-NEXT: [[M0:%.*]] = fmul double [[X0]], [[Y0]] ; CHECK-NEXT: [[PX1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2]], i64 0, i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[PX0]] to <2 x double>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 4 +; CHECK-NEXT: [[X1:%.*]] = load double, double* [[PX1]], align 4 ; CHECK-NEXT: [[PY1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3]], i64 0, i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[PY0]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[Y1:%.*]] = load double, double* [[PY1]], align 4 +; CHECK-NEXT: [[M1:%.*]] = fmul double [[X1]], [[Y1]] ; CHECK-NEXT: [[PZ0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[Z0:%.*]] = load double, double* [[PZ0]], align 4 +; CHECK-NEXT: [[A0:%.*]] = fadd double [[M0]], [[Z0]] +; CHECK-NEXT: [[I0:%.*]] = insertvalue [2 x double] undef, double [[A0]], 0 ; CHECK-NEXT: [[PZ1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1]], i64 0, i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[PZ0]] to <2 x double>* -; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0 -; CHECK-NEXT: [[I0:%.*]] = insertvalue [2 x double] undef, double [[TMP12]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP11]], i32 1 -; CHECK-NEXT: [[I1:%.*]] = insertvalue [2 x double] [[I0]], double [[TMP13]], 1 +; CHECK-NEXT: [[Z1:%.*]] = load double, double* [[PZ1]], align 4 +; CHECK-NEXT: [[A1:%.*]] = fadd double [[M1]], [[Z1]] +; CHECK-NEXT: [[I1:%.*]] = insertvalue [2 x double] [[I0]], double [[A1]], 1 ; CHECK-NEXT: store [2 x double] [[I1]], [2 x double]* [[TMP0:%.*]], align 4 ; CHECK-NEXT: ret void ; Index: test/Transforms/SLPVectorizer/X86/long_chains.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/long_chains.ll +++ test/Transforms/SLPVectorizer/X86/long_chains.ll @@ -9,26 +9,26 @@ define i32 @test(double* nocapture %A, i8* nocapture %B) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to <2 x i8>* -; CHECK-NEXT: [[TMP1:%.*]] = 
load <2 x i8>, <2 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i8> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i8> undef, i8 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i8> [[TMP4]], i8 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = sitofp <2 x i8> [[TMP6]] to <2 x double> -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], -; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x double> [[TMP11]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = fadd <2 x double> [[TMP12]], -; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP13]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x double> [[TMP14]], -; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x double> [[TMP15]], [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x double> [[TMP16]], -; CHECK-NEXT: [[TMP18:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP17]], <2 x double>* [[TMP18]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[B:%.*]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP0]], 3 +; CHECK-NEXT: [[ADD4:%.*]] = add i8 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> undef, i8 [[ADD]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i8> [[TMP2]], i8 [[ADD4]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP8]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x double> [[TMP10]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], +; CHECK-NEXT: [[TMP13:%.*]] = fmul <2 x double> [[TMP12]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd <2 x double> [[TMP13]], +; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: Index: test/Transforms/SLPVectorizer/X86/pr35497.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/pr35497.ll +++ test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -15,17 +15,16 @@ ; CHECK-NEXT: [[OR_1:%.*]] = or i64 undef, 1 ; CHECK-NEXT: store i64 [[OR_1]], i64* undef, align 8 ; CHECK-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0 +; CHECK-NEXT: [[FOO_3:%.*]] = load i64, i64* [[FOO_1]], align 8 ; CHECK-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 +; CHECK-NEXT: [[FOO_4:%.*]] = load i64, i64* [[FOO_2]], align 8 ; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 -; 
CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[AND_2:%.*]] = and i64 [[OR_1]], [[FOO_3]] +; CHECK-NEXT: [[AND_1:%.*]] = and i64 [[BAR5]], [[FOO_4]] ; CHECK-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 +; CHECK-NEXT: store i64 [[AND_2]], i64* [[BAR3]], align 8 ; CHECK-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 +; CHECK-NEXT: store i64 [[AND_1]], i64* [[BAR4]], align 8 ; CHECK-NEXT: ret void ; for.body.lr.ph.i: Index: test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll +++ test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll @@ -15,33 +15,84 @@ ; Vector cost is 6, Scalar cost is 7 ; SSE: Adding cost -1 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction) define i32 @test_add(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_add( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = add i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; AVX-LABEL: @test_add( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: 
[[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[MUL_18:%.*]] = add i32 undef, undef +; AVX-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] +; AVX-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] +; AVX-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] +; AVX-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] +; AVX-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_add( +; SSE-NEXT: entry: +; SSE-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4 +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; SSE-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = add i32 [[TMP1]], [[TMP0]] +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; SSE-NEXT: [[MUL_29:%.*]] = add i32 [[TMP2]], [[MUL_18]] +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; SSE-NEXT: [[MUL_310:%.*]] = add i32 [[TMP3]], [[MUL_29]] +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4 +; SSE-NEXT: [[MUL_411:%.*]] = add i32 [[TMP4]], [[MUL_310]] +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4 +; SSE-NEXT: [[MUL_512:%.*]] = add i32 [[TMP5]], [[MUL_411]] +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4 +; SSE-NEXT: [[MUL_613:%.*]] = add i32 [[TMP6]], [[MUL_512]] +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4 +; SSE-NEXT: [[MUL_714:%.*]] = add i32 [[TMP7]], [[MUL_613]] +; SSE-NEXT: ret i32 [[MUL_714]] ; +; SSE2-LABEL: @test_add( +; SSE2-NEXT: entry: +; SSE2-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4 +; SSE2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; SSE2-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; SSE2-NEXT: [[MUL_18:%.*]] = add i32 [[TMP1]], [[TMP0]] +; SSE2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE2-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; SSE2-NEXT: [[MUL_29:%.*]] = add i32 [[TMP2]], [[MUL_18]] +; SSE2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], 
i64 3 +; SSE2-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; SSE2-NEXT: [[MUL_310:%.*]] = add i32 [[TMP3]], [[MUL_29]] +; SSE2-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE2-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4 +; SSE2-NEXT: [[MUL_411:%.*]] = add i32 [[TMP4]], [[MUL_310]] +; SSE2-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE2-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4 +; SSE2-NEXT: [[MUL_512:%.*]] = add i32 [[TMP5]], [[MUL_411]] +; SSE2-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE2-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4 +; SSE2-NEXT: [[MUL_613:%.*]] = add i32 [[TMP6]], [[MUL_512]] +; SSE2-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE2-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4 +; SSE2-NEXT: [[MUL_714:%.*]] = add i32 [[TMP7]], [[MUL_613]] +; SSE2-NEXT: ret i32 [[MUL_714]] entry: %0 = load i32, i32* %p, align 4 %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1 @@ -138,30 +189,29 @@ define i32 @test_and(i32* nocapture readonly %p) { ; CHECK-LABEL: @test_and( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[MUL_18:%.*]] = and i32 [[TMP1]], [[TMP0]] ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[MUL_29:%.*]] = and i32 [[TMP2]], [[MUL_18]] ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[MUL_310:%.*]] = and i32 [[TMP3]], [[MUL_29]] ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4 +; CHECK-NEXT: [[MUL_411:%.*]] = and i32 [[TMP4]], [[MUL_310]] ; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4 +; CHECK-NEXT: [[MUL_512:%.*]] = and i32 [[TMP5]], [[MUL_411]] ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4 +; CHECK-NEXT: [[MUL_613:%.*]] = and i32 [[TMP6]], [[MUL_512]] ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = and i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = and i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = and i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = and i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = and i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = and i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], 
[[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = and i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4 +; CHECK-NEXT: [[MUL_714:%.*]] = and i32 [[TMP7]], [[MUL_613]] +; CHECK-NEXT: ret i32 [[MUL_714]] ; entry: %0 = load i32, i32* %p, align 4 @@ -199,30 +249,29 @@ define i32 @test_or(i32* nocapture readonly %p) { ; CHECK-LABEL: @test_or( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[MUL_18:%.*]] = or i32 [[TMP1]], [[TMP0]] ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[MUL_29:%.*]] = or i32 [[TMP2]], [[MUL_18]] ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[MUL_310:%.*]] = or i32 [[TMP3]], [[MUL_29]] ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4 +; CHECK-NEXT: [[MUL_411:%.*]] = or i32 [[TMP4]], [[MUL_310]] ; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4 +; CHECK-NEXT: [[MUL_512:%.*]] = or i32 [[TMP5]], [[MUL_411]] ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4 +; CHECK-NEXT: [[MUL_613:%.*]] = or i32 [[TMP6]], [[MUL_512]] ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = or i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = or i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = or i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = or i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = or i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = or i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = or i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4 +; CHECK-NEXT: [[MUL_714:%.*]] = or i32 [[TMP7]], [[MUL_613]] +; CHECK-NEXT: ret i32 [[MUL_714]] ; entry: %0 = load i32, i32* %p, align 4 @@ 
-260,30 +309,29 @@ define i32 @test_xor(i32* nocapture readonly %p) { ; CHECK-LABEL: @test_xor( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; CHECK-NEXT: [[MUL_18:%.*]] = xor i32 [[TMP1]], [[TMP0]] ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[MUL_29:%.*]] = xor i32 [[TMP2]], [[MUL_18]] ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[MUL_310:%.*]] = xor i32 [[TMP3]], [[MUL_29]] ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4 +; CHECK-NEXT: [[MUL_411:%.*]] = xor i32 [[TMP4]], [[MUL_310]] ; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4 +; CHECK-NEXT: [[MUL_512:%.*]] = xor i32 [[TMP5]], [[MUL_411]] ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4 +; CHECK-NEXT: [[MUL_613:%.*]] = xor i32 [[TMP6]], [[MUL_512]] ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = xor i32 undef, undef -; CHECK-NEXT: [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]] -; CHECK-NEXT: [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]] -; CHECK-NEXT: [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]] -; CHECK-NEXT: [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]] -; CHECK-NEXT: [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]] -; CHECK-NEXT: ret i32 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4 +; CHECK-NEXT: [[MUL_714:%.*]] = xor i32 [[TMP7]], [[MUL_613]] +; CHECK-NEXT: ret i32 [[MUL_714]] ; entry: %0 = load i32, i32* %p, align 4 Index: test/Transforms/SLPVectorizer/X86/remark_horcost.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/remark_horcost.ll +++ test/Transforms/SLPVectorizer/X86/remark_horcost.ll @@ -114,7 +114,7 @@ ; YAML-NEXT: Function: foo ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' - ; YAML-NEXT: - Cost: '-8' + ; YAML-NEXT: - Cost: '-4' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '4' Index: test/Transforms/SLPVectorizer/X86/shift-ashr.ll 
=================================================================== --- test/Transforms/SLPVectorizer/X86/shift-ashr.ll +++ test/Transforms/SLPVectorizer/X86/shift-ashr.ll @@ -224,22 +224,70 @@ ; SSE-NEXT: ret void ; ; AVX1-LABEL: @ashr_v16i32( -; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP9:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP5]] -; AVX1-NEXT: [[TMP10:%.*]] = ashr <4 x i32> [[TMP2]], [[TMP6]] -; AVX1-NEXT: [[TMP11:%.*]] = ashr <4 x i32> [[TMP3]], [[TMP7]] -; AVX1-NEXT: [[TMP12:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP8]] -; AVX1-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; AVX1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX1-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX1-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 +; AVX1-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 +; AVX1-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 +; AVX1-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 +; AVX1-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 +; AVX1-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 +; AVX1-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 +; AVX1-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 
0, i64 10), align 4 +; AVX1-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 +; AVX1-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 +; AVX1-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 +; AVX1-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 +; AVX1-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 +; AVX1-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 +; AVX1-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 +; AVX1-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 +; AVX1-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 +; AVX1-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 +; AVX1-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 +; AVX1-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 +; AVX1-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 +; AVX1-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 +; AVX1-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 +; AVX1-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 +; AVX1-NEXT: [[R0:%.*]] = ashr i32 [[A0]], [[B0]] +; AVX1-NEXT: [[R1:%.*]] = ashr i32 [[A1]], [[B1]] +; AVX1-NEXT: [[R2:%.*]] = ashr i32 [[A2]], [[B2]] +; AVX1-NEXT: [[R3:%.*]] = ashr i32 [[A3]], [[B3]] +; AVX1-NEXT: [[R4:%.*]] = ashr i32 [[A4]], [[B4]] +; AVX1-NEXT: [[R5:%.*]] = ashr i32 [[A5]], [[B5]] +; AVX1-NEXT: [[R6:%.*]] = ashr i32 [[A6]], [[B6]] +; AVX1-NEXT: [[R7:%.*]] = ashr i32 [[A7]], [[B7]] +; AVX1-NEXT: [[R8:%.*]] = ashr i32 [[A8]], [[B8]] +; AVX1-NEXT: [[R9:%.*]] = ashr i32 [[A9]], [[B9]] +; AVX1-NEXT: [[R10:%.*]] = ashr i32 [[A10]], [[B10]] +; AVX1-NEXT: [[R11:%.*]] = ashr i32 [[A11]], [[B11]] +; AVX1-NEXT: [[R12:%.*]] = ashr i32 [[A12]], [[B12]] +; AVX1-NEXT: [[R13:%.*]] = ashr i32 [[A13]], [[B13]] +; AVX1-NEXT: [[R14:%.*]] = ashr i32 [[A14]], [[B14]] +; AVX1-NEXT: [[R15:%.*]] = ashr i32 [[A15]], [[B15]] +; AVX1-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 +; AVX1-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 
x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 +; AVX1-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 +; AVX1-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 +; AVX1-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 +; AVX1-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 +; AVX1-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 +; AVX1-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 +; AVX1-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 +; AVX1-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 +; AVX1-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 +; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 +; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 +; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 +; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 +; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ashr_v16i32( @@ -635,24 +683,321 @@ } define void @ashr_v64i8() { -; CHECK-LABEL: @ashr_v64i8( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]] -; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* 
getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 -; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 -; CHECK-NEXT: ret void +; SSE-LABEL: @ashr_v64i8( +; SSE-NEXT: [[A0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0), align 1 +; SSE-NEXT: [[A1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1), align 1 +; SSE-NEXT: [[A2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2), align 1 +; SSE-NEXT: [[A3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3), align 1 +; SSE-NEXT: [[A4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4), align 1 +; SSE-NEXT: [[A5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5), align 1 +; SSE-NEXT: [[A6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6), align 1 +; SSE-NEXT: [[A7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7), align 1 +; SSE-NEXT: [[A8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8), align 1 +; SSE-NEXT: [[A9:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9), align 1 +; SSE-NEXT: [[A10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1 +; SSE-NEXT: [[A11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1 +; SSE-NEXT: [[A12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1 +; SSE-NEXT: [[A13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1 +; SSE-NEXT: [[A14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1 +; SSE-NEXT: [[A15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1 +; SSE-NEXT: [[A16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1 +; SSE-NEXT: [[A17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1 +; SSE-NEXT: [[A18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1 +; SSE-NEXT: [[A19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1 +; SSE-NEXT: [[A20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1 +; SSE-NEXT: [[A21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1 +; SSE-NEXT: [[A22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1 +; SSE-NEXT: [[A23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1 +; SSE-NEXT: [[A24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1 +; SSE-NEXT: [[A25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1 +; SSE-NEXT: [[A26:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1 +; SSE-NEXT: [[A27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1 +; SSE-NEXT: [[A28:%.*]] = load i8, i8* getelementptr 
inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1 +; SSE-NEXT: [[A29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1 +; SSE-NEXT: [[A30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1 +; SSE-NEXT: [[A31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1 +; SSE-NEXT: [[A32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1 +; SSE-NEXT: [[A33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1 +; SSE-NEXT: [[A34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1 +; SSE-NEXT: [[A35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1 +; SSE-NEXT: [[A36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1 +; SSE-NEXT: [[A37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1 +; SSE-NEXT: [[A38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1 +; SSE-NEXT: [[A39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1 +; SSE-NEXT: [[A40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1 +; SSE-NEXT: [[A41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1 +; SSE-NEXT: [[A42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1 +; SSE-NEXT: [[A43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1 +; SSE-NEXT: [[A44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1 +; SSE-NEXT: [[A45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1 +; SSE-NEXT: [[A46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1 +; SSE-NEXT: [[A47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1 +; SSE-NEXT: [[A48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1 +; SSE-NEXT: [[A49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1 +; SSE-NEXT: [[A50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1 +; SSE-NEXT: [[A51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1 +; SSE-NEXT: [[A52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1 +; SSE-NEXT: [[A53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1 +; SSE-NEXT: [[A54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1 +; SSE-NEXT: [[A55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1 +; SSE-NEXT: [[A56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1 +; SSE-NEXT: [[A57:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1 +; SSE-NEXT: [[A58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1 +; SSE-NEXT: [[A59:%.*]] = load i8, i8* 
getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1 +; SSE-NEXT: [[A60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1 +; SSE-NEXT: [[A61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1 +; SSE-NEXT: [[A62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1 +; SSE-NEXT: [[A63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1 +; SSE-NEXT: [[B0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0), align 1 +; SSE-NEXT: [[B1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1), align 1 +; SSE-NEXT: [[B2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2), align 1 +; SSE-NEXT: [[B3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3), align 1 +; SSE-NEXT: [[B4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4), align 1 +; SSE-NEXT: [[B5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5), align 1 +; SSE-NEXT: [[B6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6), align 1 +; SSE-NEXT: [[B7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7), align 1 +; SSE-NEXT: [[B8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8), align 1 +; SSE-NEXT: [[B9:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9), align 1 +; SSE-NEXT: [[B10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1 +; SSE-NEXT: [[B11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1 +; SSE-NEXT: [[B12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1 +; SSE-NEXT: [[B13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1 +; SSE-NEXT: [[B14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1 +; SSE-NEXT: [[B15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1 +; SSE-NEXT: [[B16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1 +; SSE-NEXT: [[B17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1 +; SSE-NEXT: [[B18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1 +; SSE-NEXT: [[B19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1 +; SSE-NEXT: [[B20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1 +; SSE-NEXT: [[B21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1 +; SSE-NEXT: [[B22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1 +; SSE-NEXT: [[B23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1 +; SSE-NEXT: [[B24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1 +; SSE-NEXT: [[B25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1 +; SSE-NEXT: [[B26:%.*]] = load i8, i8* 
getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1 +; SSE-NEXT: [[B27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1 +; SSE-NEXT: [[B28:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1 +; SSE-NEXT: [[B29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1 +; SSE-NEXT: [[B30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1 +; SSE-NEXT: [[B31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1 +; SSE-NEXT: [[B32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1 +; SSE-NEXT: [[B33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1 +; SSE-NEXT: [[B34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1 +; SSE-NEXT: [[B35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1 +; SSE-NEXT: [[B36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1 +; SSE-NEXT: [[B37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1 +; SSE-NEXT: [[B38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1 +; SSE-NEXT: [[B39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1 +; SSE-NEXT: [[B40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1 +; SSE-NEXT: [[B41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1 +; SSE-NEXT: [[B42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1 +; SSE-NEXT: [[B43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1 +; SSE-NEXT: [[B44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1 +; SSE-NEXT: [[B45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1 +; SSE-NEXT: [[B46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1 +; SSE-NEXT: [[B47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1 +; SSE-NEXT: [[B48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1 +; SSE-NEXT: [[B49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1 +; SSE-NEXT: [[B50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1 +; SSE-NEXT: [[B51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1 +; SSE-NEXT: [[B52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1 +; SSE-NEXT: [[B53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1 +; SSE-NEXT: [[B54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1 +; SSE-NEXT: [[B55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1 +; SSE-NEXT: [[B56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1 +; SSE-NEXT: [[B57:%.*]] = 
load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1 +; SSE-NEXT: [[B58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1 +; SSE-NEXT: [[B59:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1 +; SSE-NEXT: [[B60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1 +; SSE-NEXT: [[B61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1 +; SSE-NEXT: [[B62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1 +; SSE-NEXT: [[B63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1 +; SSE-NEXT: [[R0:%.*]] = ashr i8 [[A0]], [[B0]] +; SSE-NEXT: [[R1:%.*]] = ashr i8 [[A1]], [[B1]] +; SSE-NEXT: [[R2:%.*]] = ashr i8 [[A2]], [[B2]] +; SSE-NEXT: [[R3:%.*]] = ashr i8 [[A3]], [[B3]] +; SSE-NEXT: [[R4:%.*]] = ashr i8 [[A4]], [[B4]] +; SSE-NEXT: [[R5:%.*]] = ashr i8 [[A5]], [[B5]] +; SSE-NEXT: [[R6:%.*]] = ashr i8 [[A6]], [[B6]] +; SSE-NEXT: [[R7:%.*]] = ashr i8 [[A7]], [[B7]] +; SSE-NEXT: [[R8:%.*]] = ashr i8 [[A8]], [[B8]] +; SSE-NEXT: [[R9:%.*]] = ashr i8 [[A9]], [[B9]] +; SSE-NEXT: [[R10:%.*]] = ashr i8 [[A10]], [[B10]] +; SSE-NEXT: [[R11:%.*]] = ashr i8 [[A11]], [[B11]] +; SSE-NEXT: [[R12:%.*]] = ashr i8 [[A12]], [[B12]] +; SSE-NEXT: [[R13:%.*]] = ashr i8 [[A13]], [[B13]] +; SSE-NEXT: [[R14:%.*]] = ashr i8 [[A14]], [[B14]] +; SSE-NEXT: [[R15:%.*]] = ashr i8 [[A15]], [[B15]] +; SSE-NEXT: [[R16:%.*]] = ashr i8 [[A16]], [[B16]] +; SSE-NEXT: [[R17:%.*]] = ashr i8 [[A17]], [[B17]] +; SSE-NEXT: [[R18:%.*]] = ashr i8 [[A18]], [[B18]] +; SSE-NEXT: [[R19:%.*]] = ashr i8 [[A19]], [[B19]] +; SSE-NEXT: [[R20:%.*]] = ashr i8 [[A20]], [[B20]] +; SSE-NEXT: [[R21:%.*]] = ashr i8 [[A21]], [[B21]] +; SSE-NEXT: [[R22:%.*]] = ashr i8 [[A22]], [[B22]] +; SSE-NEXT: [[R23:%.*]] = ashr i8 [[A23]], [[B23]] +; SSE-NEXT: [[R24:%.*]] = ashr i8 [[A24]], [[B24]] +; SSE-NEXT: [[R25:%.*]] = ashr i8 [[A25]], [[B25]] +; SSE-NEXT: [[R26:%.*]] = ashr i8 [[A26]], [[B26]] +; SSE-NEXT: [[R27:%.*]] = ashr i8 [[A27]], [[B27]] +; SSE-NEXT: [[R28:%.*]] = ashr i8 [[A28]], [[B28]] +; SSE-NEXT: [[R29:%.*]] = ashr i8 [[A29]], [[B29]] +; SSE-NEXT: [[R30:%.*]] = ashr i8 [[A30]], [[B30]] +; SSE-NEXT: [[R31:%.*]] = ashr i8 [[A31]], [[B31]] +; SSE-NEXT: [[R32:%.*]] = ashr i8 [[A32]], [[B32]] +; SSE-NEXT: [[R33:%.*]] = ashr i8 [[A33]], [[B33]] +; SSE-NEXT: [[R34:%.*]] = ashr i8 [[A34]], [[B34]] +; SSE-NEXT: [[R35:%.*]] = ashr i8 [[A35]], [[B35]] +; SSE-NEXT: [[R36:%.*]] = ashr i8 [[A36]], [[B36]] +; SSE-NEXT: [[R37:%.*]] = ashr i8 [[A37]], [[B37]] +; SSE-NEXT: [[R38:%.*]] = ashr i8 [[A38]], [[B38]] +; SSE-NEXT: [[R39:%.*]] = ashr i8 [[A39]], [[B39]] +; SSE-NEXT: [[R40:%.*]] = ashr i8 [[A40]], [[B40]] +; SSE-NEXT: [[R41:%.*]] = ashr i8 [[A41]], [[B41]] +; SSE-NEXT: [[R42:%.*]] = ashr i8 [[A42]], [[B42]] +; SSE-NEXT: [[R43:%.*]] = ashr i8 [[A43]], [[B43]] +; SSE-NEXT: [[R44:%.*]] = ashr i8 [[A44]], [[B44]] +; SSE-NEXT: [[R45:%.*]] = ashr i8 [[A45]], [[B45]] +; SSE-NEXT: [[R46:%.*]] = ashr i8 [[A46]], [[B46]] +; SSE-NEXT: [[R47:%.*]] = ashr i8 [[A47]], [[B47]] +; SSE-NEXT: [[R48:%.*]] = ashr i8 [[A48]], [[B48]] +; SSE-NEXT: [[R49:%.*]] = ashr i8 [[A49]], [[B49]] +; SSE-NEXT: [[R50:%.*]] = ashr i8 [[A50]], [[B50]] +; SSE-NEXT: [[R51:%.*]] = ashr i8 [[A51]], [[B51]] +; SSE-NEXT: [[R52:%.*]] = ashr i8 [[A52]], [[B52]] +; SSE-NEXT: [[R53:%.*]] = ashr i8 [[A53]], 
[[B53]] +; SSE-NEXT: [[R54:%.*]] = ashr i8 [[A54]], [[B54]] +; SSE-NEXT: [[R55:%.*]] = ashr i8 [[A55]], [[B55]] +; SSE-NEXT: [[R56:%.*]] = ashr i8 [[A56]], [[B56]] +; SSE-NEXT: [[R57:%.*]] = ashr i8 [[A57]], [[B57]] +; SSE-NEXT: [[R58:%.*]] = ashr i8 [[A58]], [[B58]] +; SSE-NEXT: [[R59:%.*]] = ashr i8 [[A59]], [[B59]] +; SSE-NEXT: [[R60:%.*]] = ashr i8 [[A60]], [[B60]] +; SSE-NEXT: [[R61:%.*]] = ashr i8 [[A61]], [[B61]] +; SSE-NEXT: [[R62:%.*]] = ashr i8 [[A62]], [[B62]] +; SSE-NEXT: [[R63:%.*]] = ashr i8 [[A63]], [[B63]] +; SSE-NEXT: store i8 [[R0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0), align 1 +; SSE-NEXT: store i8 [[R1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1), align 1 +; SSE-NEXT: store i8 [[R2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2), align 1 +; SSE-NEXT: store i8 [[R3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3), align 1 +; SSE-NEXT: store i8 [[R4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4), align 1 +; SSE-NEXT: store i8 [[R5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5), align 1 +; SSE-NEXT: store i8 [[R6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6), align 1 +; SSE-NEXT: store i8 [[R7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7), align 1 +; SSE-NEXT: store i8 [[R8]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8), align 1 +; SSE-NEXT: store i8 [[R9]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9), align 1 +; SSE-NEXT: store i8 [[R10]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1 +; SSE-NEXT: store i8 [[R11]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1 +; SSE-NEXT: store i8 [[R12]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1 +; SSE-NEXT: store i8 [[R13]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1 +; SSE-NEXT: store i8 [[R14]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1 +; SSE-NEXT: store i8 [[R15]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1 +; SSE-NEXT: store i8 [[R16]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1 +; SSE-NEXT: store i8 [[R17]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1 +; SSE-NEXT: store i8 [[R18]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1 +; SSE-NEXT: store i8 [[R19]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1 +; SSE-NEXT: store i8 [[R20]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1 +; SSE-NEXT: store i8 [[R21]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1 +; SSE-NEXT: store i8 [[R22]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1 +; SSE-NEXT: store i8 [[R23]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1 +; SSE-NEXT: store i8 [[R24]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1 +; SSE-NEXT: store i8 [[R25]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1 +; SSE-NEXT: store i8 [[R26]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1 +; SSE-NEXT: store i8 [[R27]], i8* getelementptr inbounds ([64 x i8], [64 x 
i8]* @c8, i32 0, i64 27), align 1 +; SSE-NEXT: store i8 [[R28]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1 +; SSE-NEXT: store i8 [[R29]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1 +; SSE-NEXT: store i8 [[R30]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1 +; SSE-NEXT: store i8 [[R31]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1 +; SSE-NEXT: store i8 [[R32]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1 +; SSE-NEXT: store i8 [[R33]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1 +; SSE-NEXT: store i8 [[R34]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1 +; SSE-NEXT: store i8 [[R35]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1 +; SSE-NEXT: store i8 [[R36]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1 +; SSE-NEXT: store i8 [[R37]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1 +; SSE-NEXT: store i8 [[R38]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1 +; SSE-NEXT: store i8 [[R39]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1 +; SSE-NEXT: store i8 [[R40]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1 +; SSE-NEXT: store i8 [[R41]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1 +; SSE-NEXT: store i8 [[R42]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1 +; SSE-NEXT: store i8 [[R43]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1 +; SSE-NEXT: store i8 [[R44]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1 +; SSE-NEXT: store i8 [[R45]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1 +; SSE-NEXT: store i8 [[R46]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1 +; SSE-NEXT: store i8 [[R47]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1 +; SSE-NEXT: store i8 [[R48]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1 +; SSE-NEXT: store i8 [[R49]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1 +; SSE-NEXT: store i8 [[R50]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1 +; SSE-NEXT: store i8 [[R51]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1 +; SSE-NEXT: store i8 [[R52]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1 +; SSE-NEXT: store i8 [[R53]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1 +; SSE-NEXT: store i8 [[R54]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1 +; SSE-NEXT: store i8 [[R55]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1 +; SSE-NEXT: store i8 [[R56]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1 +; SSE-NEXT: store i8 [[R57]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1 +; SSE-NEXT: store i8 [[R58]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1 +; SSE-NEXT: store i8 [[R59]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1 
+; SSE-NEXT: store i8 [[R60]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1 +; SSE-NEXT: store i8 [[R61]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1 +; SSE-NEXT: store i8 [[R62]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1 +; SSE-NEXT: store i8 [[R63]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1 +; SSE-NEXT: ret void +; +; AVX-LABEL: @ashr_v64i8( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]] +; AVX-NEXT: [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]] +; AVX-NEXT: [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]] +; AVX-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]] +; AVX-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; AVX-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @ashr_v64i8( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX512-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; AVX512-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX512-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX512-NEXT: 
[[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX512-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]] +; AVX512-NEXT: [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]] +; AVX512-NEXT: [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]] +; AVX512-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]] +; AVX512-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; AVX512-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX512-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX512-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX512-NEXT: ret void +; +; XOP-LABEL: @ashr_v64i8( +; XOP-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]] +; XOP-NEXT: [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]] +; XOP-NEXT: [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]] +; XOP-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]] +; XOP-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; XOP-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; XOP-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; XOP-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 +; XOP-NEXT: ret void ; %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1 %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1 Index: test/Transforms/SLPVectorizer/X86/shift-lshr.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/shift-lshr.ll +++ test/Transforms/SLPVectorizer/X86/shift-lshr.ll @@ -22,41 +22,73 @@ define void @lshr_v8i64() { ; SSE-LABEL: @lshr_v8i64( -; SSE-NEXT: [[TMP1:%.*]] = load <2 x 
i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), 
align 8
+; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SSE-NEXT: [[R0:%.*]] = lshr i64 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = lshr i64 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = lshr i64 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = lshr i64 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = lshr i64 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = lshr i64 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = lshr i64 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = lshr i64 [[A7]], [[B7]]
+; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; SSE-NEXT: ret void
;
; AVX1-LABEL: @lshr_v8i64(
-; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]]
-; AVX1-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]]
-; AVX1-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]]
-; AVX1-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]]
-; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; AVX1-NEXT: [[R0:%.*]] = lshr i64 [[A0]], [[B0]]
+; AVX1-NEXT: [[R1:%.*]] = lshr i64 [[A1]], [[B1]]
+; AVX1-NEXT: [[R2:%.*]] = lshr i64 [[A2]], [[B2]]
+; AVX1-NEXT: [[R3:%.*]] = lshr i64 [[A3]], [[B3]]
+; AVX1-NEXT: [[R4:%.*]] = lshr i64 [[A4]], [[B4]]
+; AVX1-NEXT: [[R5:%.*]] = lshr i64 [[A5]], [[B5]]
+; AVX1-NEXT: [[R6:%.*]] = lshr i64 [[A6]], [[B6]]
+; AVX1-NEXT: [[R7:%.*]] = lshr i64 [[A7]], [[B7]]
+; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @lshr_v8i64(
@@ -191,16 +223,83 @@
; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
; SSE-NEXT: ret void
;
-; AVX-LABEL: @lshr_v16i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]]
-; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]]
-; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT: ret void
+; AVX1-LABEL: @lshr_v16i32(
+; AVX1-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; AVX1-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; AVX1-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; AVX1-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; AVX1-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; AVX1-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; AVX1-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; AVX1-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; AVX1-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; AVX1-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; AVX1-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; AVX1-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; AVX1-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; AVX1-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; AVX1-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; AVX1-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; AVX1-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; AVX1-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; AVX1-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; AVX1-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; AVX1-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; AVX1-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; AVX1-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; AVX1-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; AVX1-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; AVX1-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; AVX1-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; AVX1-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; AVX1-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; AVX1-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; AVX1-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; AVX1-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; AVX1-NEXT: [[R0:%.*]] = lshr i32 [[A0]], [[B0]]
+; AVX1-NEXT: [[R1:%.*]] = lshr i32 [[A1]], [[B1]]
+; AVX1-NEXT: [[R2:%.*]] = lshr i32 [[A2]], [[B2]]
+; AVX1-NEXT: [[R3:%.*]] = lshr i32 [[A3]], [[B3]]
+; AVX1-NEXT: [[R4:%.*]] = lshr i32 [[A4]], [[B4]]
+; AVX1-NEXT: [[R5:%.*]] = lshr i32 [[A5]], [[B5]]
+; AVX1-NEXT: [[R6:%.*]] = lshr i32 [[A6]], [[B6]]
+; AVX1-NEXT: [[R7:%.*]] = lshr i32 [[A7]], [[B7]]
+; AVX1-NEXT: [[R8:%.*]] = lshr i32 [[A8]], [[B8]]
+; AVX1-NEXT: [[R9:%.*]] = lshr i32 [[A9]], [[B9]]
+; AVX1-NEXT: [[R10:%.*]] = lshr i32 [[A10]], [[B10]]
+; AVX1-NEXT: [[R11:%.*]] = lshr i32 [[A11]], [[B11]]
+; AVX1-NEXT: [[R12:%.*]] = lshr i32 [[A12]], [[B12]]
+; AVX1-NEXT: [[R13:%.*]] = lshr i32 [[A13]], [[B13]]
+; AVX1-NEXT: [[R14:%.*]] = lshr i32 [[A14]], [[B14]]
+; AVX1-NEXT: [[R15:%.*]] = lshr i32 [[A15]], [[B15]]
+; AVX1-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; AVX1-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; AVX1-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; AVX1-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; AVX1-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; AVX1-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; AVX1-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; AVX1-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; AVX1-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; AVX1-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; AVX1-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @lshr_v16i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]]
+; AVX2-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX2-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT: ret void
;
; AVX512-LABEL: @lshr_v16i32(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
Index: test/Transforms/SLPVectorizer/X86/shift-shl.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/shift-shl.ll
+++ test/Transforms/SLPVectorizer/X86/shift-shl.ll
@@ -22,41 +22,73 @@
define void @shl_v8i64() {
; SSE-LABEL: @shl_v8i64(
-; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]]
-; SSE-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]]
-; SSE-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]]
-; SSE-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]]
-; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SSE-NEXT: [[R0:%.*]] = shl i64 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = shl i64 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = shl i64 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = shl i64 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = shl i64 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = shl i64 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = shl i64 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = shl i64 [[A7]], [[B7]]
+; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; SSE-NEXT: ret void
;
; AVX1-LABEL: @shl_v8i64(
-; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX1-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]]
-; AVX1-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]]
-; AVX1-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]]
-; AVX1-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]]
-; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; AVX1-NEXT: [[R0:%.*]] = shl i64 [[A0]], [[B0]]
+; AVX1-NEXT: [[R1:%.*]] = shl i64 [[A1]], [[B1]]
+; AVX1-NEXT: [[R2:%.*]] = shl i64 [[A2]], [[B2]]
+; AVX1-NEXT: [[R3:%.*]] = shl i64 [[A3]], [[B3]]
+; AVX1-NEXT: [[R4:%.*]] = shl i64 [[A4]], [[B4]]
+; AVX1-NEXT: [[R5:%.*]] = shl i64 [[A5]], [[B5]]
+; AVX1-NEXT: [[R6:%.*]] = shl i64 [[A6]], [[B6]]
+; AVX1-NEXT: [[R7:%.*]] = shl i64 [[A7]], [[B7]]
+; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @shl_v8i64(
@@ -125,22 +157,70 @@
define void @shl_v16i32() {
; SSE-LABEL: @shl_v16i32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP1]], [[TMP5]]
-; SSE-NEXT: [[TMP10:%.*]] = shl <4 x i32> [[TMP2]], [[TMP6]]
-; SSE-NEXT: [[TMP11:%.*]] = shl <4 x i32> [[TMP3]], [[TMP7]]
-; SSE-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP4]], [[TMP8]]
-; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; SSE-NEXT: [[R0:%.*]] = shl i32 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = shl i32 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = shl i32 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = shl i32 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = shl i32 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = shl i32 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = shl i32 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = shl i32 [[A7]], [[B7]]
+; SSE-NEXT: [[R8:%.*]] = shl i32 [[A8]], [[B8]]
+; SSE-NEXT: [[R9:%.*]] = shl i32 [[A9]], [[B9]]
+; SSE-NEXT: [[R10:%.*]] = shl i32 [[A10]], [[B10]]
+; SSE-NEXT: [[R11:%.*]] = shl i32 [[A11]], [[B11]]
+; SSE-NEXT: [[R12:%.*]] = shl i32 [[A12]], [[B12]]
+; SSE-NEXT: [[R13:%.*]] = shl i32 [[A13]], [[B13]]
+; SSE-NEXT: [[R14:%.*]] = shl i32 [[A14]], [[B14]]
+; SSE-NEXT: [[R15:%.*]] = shl i32 [[A15]], [[B15]]
+; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @shl_v16i32(
Index: test/Transforms/SLPVectorizer/X86/supernode.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/supernode.ll
+++ test/Transforms/SLPVectorizer/X86/supernode.ll
@@ -76,18 +76,16 @@
; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1
; ENABLED-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8
; ENABLED-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8
-; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
-; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; ENABLED-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8
+; ENABLED-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8
; ENABLED-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; ENABLED-NEXT: [[C1:%.*]] = load double, double* [[IDXC1]], align 8
-; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0
-; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
-; ENABLED-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]]
-; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
-; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1
-; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]]
-; ENABLED-NEXT: [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
-; ENABLED-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
+; ENABLED-NEXT: [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]]
+; ENABLED-NEXT: [[SUBC1B1:%.*]] = fsub fast double [[C1]], [[B1]]
+; ENABLED-NEXT: [[ADD0:%.*]] = fadd fast double [[SUBA0B0]], [[C0]]
+; ENABLED-NEXT: [[ADD1:%.*]] = fadd fast double [[SUBC1B1]], [[A1]]
+; ENABLED-NEXT: store double [[ADD0]], double* [[IDXS0]], align 8
+; ENABLED-NEXT: store double [[ADD1]], double* [[IDXS1]], align 8
; ENABLED-NEXT: ret void
;
entry: