Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1809,6 +1809,38 @@
   return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
 }
 
+static bool isFreeOp(const Instruction *I, const Type *Ty,
+                     const Instruction *OpI) {
+  switch (I->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return !isa<LoadInst>(I->getOperand(0)) ||
+           !isa<LoadInst>(I->getOperand(1)) || I->getOperand(0) == OpI;
+  case Instruction::Mul:
+    return (!isa<LoadInst>(I->getOperand(0)) ||
+            !isa<LoadInst>(I->getOperand(1)) || I->getOperand(0) == OpI) &&
+           Ty->getScalarSizeInBits() <= 64;
+  case Instruction::Shl:
+  case Instruction::AShr:
+  case Instruction::LShr:
+    return I->getOperand(0) == OpI;
+  case Instruction::FAdd:
+  case Instruction::FSub:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+    return !isa<Constant>(I->getOperand(0)) &&
+           !isa<Constant>(I->getOperand(1)) &&
+           (!isa<LoadInst>(I->getOperand(0)) ||
+            !isa<LoadInst>(I->getOperand(1)) || I->getOperand(0) == OpI);
+  }
+  return false;
+}
+
 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                 unsigned AddressSpace, const Instruction *I) {
   // Handle non-power-of-two vectors such as <3 x float>
@@ -1834,6 +1866,26 @@
                                                Opcode == Instruction::Store);
       return NumElem * Cost + SplitCost;
     }
+  } else if (I && isa<LoadInst>(I) && I->hasOneUse()) {
+    // Check if the cost of the scalar load can be considered 0. We can
+    // consider it 0 if an arithmetic|logic|compare|cast instruction can use
+    // the loaded memory address as one of its operands.
+    const Instruction *UI = I->user_back();
+    if (isFreeOp(UI, UI->getType(), I))
+      return 0;
+    switch (UI->getOpcode()) {
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      if (UI->hasOneUse() &&
+          getCastInstrCost(UI->getOpcode(), UI->getType(),
+                           UI->getOperand(0)->getType(), UI) == 0 &&
+          isFreeOp(UI->user_back(), UI->getType(), UI))
+        return 0;
+      break;
+    default:
+      break;
+    }
   }
 
   // Legalize the type.
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2236,15 +2236,25 @@
     case Instruction::Load: {
       // Cost of wide load - cost of scalar loads.
       unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
+      int ScalarLdCost = 0;
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
-                            TTI->getMemoryOpCost(Instruction::Load, ScalarTy,
-                                                 alignment, 0, VL0);
+        for (const unsigned I : E->ReuseShuffleIndices) {
+          const auto *LD = cast<LoadInst>(VL[I]);
+          ReuseShuffleCost -=
+              TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment,
+                                   LD->getPointerAddressSpace(), LD);
+        }
+      } else {
+        for (Value *V : VL) {
+          const auto *LD = cast<LoadInst>(V);
+          ScalarLdCost +=
+              TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment,
+                                   LD->getPointerAddressSpace(), LD);
+        }
       }
-      int ScalarLdCost = VecTy->getNumElements() *
-          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
-      int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
-                                           VecTy, alignment, 0, VL0);
+      int VecLdCost =
+          TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment,
+                               cast<LoadInst>(VL0)->getPointerAddressSpace());
       return ReuseShuffleCost + VecLdCost - ScalarLdCost;
     }
     case Instruction::Store: {
Index: test/Analysis/CostModel/X86/interleaved-load-i8.ll
===================================================================
--- test/Analysis/CostModel/X86/interleaved-load-i8.ll
+++ test/Analysis/CostModel/X86/interleaved-load-i8.ll
@@ -6,7 +6,7 @@
 
 ; Function Attrs: norecurse nounwind readonly uwtable
 define i32 @doit_stride3(i8* nocapture readonly %Ptr, i32 %Nels) {
-;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %0 = load i8
 ;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction: %0 = load i8
 ;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8
 ;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction: %0 = load i8
Index: test/Analysis/CostModel/X86/vectorized-loop.ll
===================================================================
--- test/Analysis/CostModel/X86/vectorized-loop.ll
+++ test/Analysis/CostModel/X86/vectorized-loop.ll
@@ -53,12 +53,12 @@
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %end.idx.rnd.down, %middle.block ]
   %13 = add nsw i64 %indvars.iv, 2
   %arrayidx = getelementptr inbounds i32, i32* %B, i64 %13
-  ;CHECK: cost of 1 {{.*}} load
+  ;CHECK: cost of 0 {{.*}} load
   %14 = load i32, i32* %arrayidx, align 4
   ;CHECK: cost of 1 {{.*}} mul
   %mul = mul nsw i32 %14, 5
   %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  ;CHECK: cost of 1 {{.*}} load
+  ;CHECK: cost of 0 {{.*}} load
   %15 = load i32, i32* %arrayidx2, align 4
   %add3 = add nsw i32 %15, %mul
   store i32 %add3, i32* %arrayidx2, align 4
Index: test/Transforms/SLPVectorizer/X86/PR36280.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/PR36280.ll
+++ test/Transforms/SLPVectorizer/X86/PR36280.ll
@@ -5,15 +5,12 @@
 ; CHECK-LABEL: @jacobi(
 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr float, float* [[P:%.*]], i64 1
 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr float, float* [[P]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP1]] to <2 x float>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[X:%.*]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[Y:%.*]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
-; CHECK-NEXT: [[ADD1:%.*]] = fadd float 
[[TMP6]], [[Z:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 -; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[TMP7]], [[ADD1]] +; CHECK-NEXT: [[P1:%.*]] = load float, float* [[GEP1]] +; CHECK-NEXT: [[P2:%.*]] = load float, float* [[GEP2]] +; CHECK-NEXT: [[MUL1:%.*]] = fmul float [[P1]], [[X:%.*]] +; CHECK-NEXT: [[MUL2:%.*]] = fmul float [[P2]], [[Y:%.*]] +; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[MUL1]], [[Z:%.*]] +; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[MUL2]], [[ADD1]] ; CHECK-NEXT: ret float [[ADD2]] ; %gep1 = getelementptr float, float* %p, i64 1 Index: test/Transforms/SLPVectorizer/X86/arith-add.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-add.ll +++ test/Transforms/SLPVectorizer/X86/arith-add.ll @@ -40,22 +40,38 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( -; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; 
SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 +; SLM-NEXT: [[R0:%.*]] = add i64 [[A0]], [[B0]] +; SLM-NEXT: [[R1:%.*]] = add i64 [[A1]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = add i64 [[A2]], [[B2]] +; SLM-NEXT: [[R3:%.*]] = add i64 [[A3]], [[B3]] +; SLM-NEXT: [[R4:%.*]] = add i64 [[A4]], [[B4]] +; SLM-NEXT: [[R5:%.*]] = add i64 [[A5]], [[B5]] +; SLM-NEXT: [[R6:%.*]] = add i64 [[A6]], [[B6]] +; SLM-NEXT: [[R7:%.*]] = add i64 [[A7]], [[B7]] +; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v8i64( Index: test/Transforms/SLPVectorizer/X86/arith-mul.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-mul.ll +++ test/Transforms/SLPVectorizer/X86/arith-mul.ll @@ -199,22 +199,70 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @mul_v16i32( -; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* 
getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 +; SLM-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 +; SLM-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 +; SLM-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 +; SLM-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 +; SLM-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 +; SLM-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 +; SLM-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 +; SLM-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 +; SLM-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 +; SLM-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 +; SLM-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 +; SLM-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 +; SLM-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 +; SLM-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 +; SLM-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 +; SLM-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 +; SLM-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 +; SLM-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 +; SLM-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds 
([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 +; SLM-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 +; SLM-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 +; SLM-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 +; SLM-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 +; SLM-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 +; SLM-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 +; SLM-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 +; SLM-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 +; SLM-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 +; SLM-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 +; SLM-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 +; SLM-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 +; SLM-NEXT: [[R0:%.*]] = mul i32 [[A0]], [[B0]] +; SLM-NEXT: [[R1:%.*]] = mul i32 [[A1]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = mul i32 [[A2]], [[B2]] +; SLM-NEXT: [[R3:%.*]] = mul i32 [[A3]], [[B3]] +; SLM-NEXT: [[R4:%.*]] = mul i32 [[A4]], [[B4]] +; SLM-NEXT: [[R5:%.*]] = mul i32 [[A5]], [[B5]] +; SLM-NEXT: [[R6:%.*]] = mul i32 [[A6]], [[B6]] +; SLM-NEXT: [[R7:%.*]] = mul i32 [[A7]], [[B7]] +; SLM-NEXT: [[R8:%.*]] = mul i32 [[A8]], [[B8]] +; SLM-NEXT: [[R9:%.*]] = mul i32 [[A9]], [[B9]] +; SLM-NEXT: [[R10:%.*]] = mul i32 [[A10]], [[B10]] +; SLM-NEXT: [[R11:%.*]] = mul i32 [[A11]], [[B11]] +; SLM-NEXT: [[R12:%.*]] = mul i32 [[A12]], [[B12]] +; SLM-NEXT: [[R13:%.*]] = mul i32 [[A13]], [[B13]] +; SLM-NEXT: [[R14:%.*]] = mul i32 [[A14]], [[B14]] +; SLM-NEXT: [[R15:%.*]] = mul i32 [[A15]], [[B15]] +; SLM-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 +; SLM-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 +; SLM-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 +; SLM-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 +; SLM-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 +; SLM-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 +; SLM-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 +; SLM-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 +; SLM-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 +; SLM-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 +; SLM-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 +; SLM-NEXT: store i32 
[[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 +; SLM-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 +; SLM-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 +; SLM-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 +; SLM-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @mul_v16i32( Index: test/Transforms/SLPVectorizer/X86/arith-sub.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-sub.ll +++ test/Transforms/SLPVectorizer/X86/arith-sub.ll @@ -40,22 +40,38 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v8i64( -; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), 
align 8 +; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 +; SLM-NEXT: [[R0:%.*]] = sub i64 [[A0]], [[B0]] +; SLM-NEXT: [[R1:%.*]] = sub i64 [[A1]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = sub i64 [[A2]], [[B2]] +; SLM-NEXT: [[R3:%.*]] = sub i64 [[A3]], [[B3]] +; SLM-NEXT: [[R4:%.*]] = sub i64 [[A4]], [[B4]] +; SLM-NEXT: [[R5:%.*]] = sub i64 [[A5]], [[B5]] +; SLM-NEXT: [[R6:%.*]] = sub i64 [[A6]], [[B6]] +; SLM-NEXT: [[R7:%.*]] = sub i64 [[A7]], [[B7]] +; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v8i64( Index: test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll +++ test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -37,25 +37,39 @@ define void @fn2(i32* %a, i32* %b, float* %c) { ; CHECK-LABEL: @fn2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 1 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 1 +; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[I1:%.*]] = load i32, i32* [[B:%.*]], align 4 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[I0]], [[I1]] +; CHECK-NEXT: [[FP1:%.*]] = sitofp i32 [[ADD1]] to float +; CHECK-NEXT: [[CALL1:%.*]] = tail call float @llvm.powi.f32(float [[FP1]], i32 [[ADD1]]) #2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 1 +; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = 
getelementptr inbounds i32, i32* [[B]], i32 1 +; CHECK-NEXT: [[I3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[I2]], [[I3]] +; CHECK-NEXT: [[FP2:%.*]] = sitofp i32 [[ADD2]] to float +; CHECK-NEXT: [[CALL2:%.*]] = tail call float @llvm.powi.f32(float [[FP2]], i32 [[ADD1]]) #2 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 2 +; CHECK-NEXT: [[I4:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 2 +; CHECK-NEXT: [[I5:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[I4]], [[I5]] +; CHECK-NEXT: [[FP3:%.*]] = sitofp i32 [[ADD3]] to float +; CHECK-NEXT: [[CALL3:%.*]] = tail call float @llvm.powi.f32(float [[FP3]], i32 [[ADD1]]) #2 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[I6:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.powi.v4f32(<4 x float> [[TMP5]], i32 [[TMP6]]) -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1 +; CHECK-NEXT: [[I7:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[I6]], [[I7]] +; CHECK-NEXT: [[FP4:%.*]] = sitofp i32 [[ADD4]] to float +; CHECK-NEXT: [[CALL4:%.*]] = tail call float @llvm.powi.f32(float [[FP4]], i32 [[ADD1]]) #2 +; CHECK-NEXT: store float [[CALL1]], float* [[C:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C]], i32 1 +; CHECK-NEXT: store float [[CALL2]], float* [[ARRAYIDX8]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2 +; CHECK-NEXT: store float [[CALL3]], float* [[ARRAYIDX9]], align 4 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[C]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 +; CHECK-NEXT: store float [[CALL4]], float* [[ARRAYIDX10]], align 4 ; CHECK-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/horizontal.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal.ll +++ test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -1395,16 +1395,14 @@ ; ; STORE-LABEL: @i32_red_example4( ; STORE-NEXT: entry: -; STORE-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16 -; STORE-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef -; STORE-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]] -; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> -; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[TMP0]], [[RDX_SHUF]] -; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> -; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], 
[[RDX_SHUF1]] -; STORE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 -; STORE-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]] -; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16 +; STORE-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16 +; STORE-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4 +; STORE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] +; STORE-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8 +; STORE-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]] +; STORE-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4 +; STORE-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]] +; STORE-NEXT: store i32 [[ADD_2]], i32* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll +++ test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll @@ -44,30 +44,29 @@ ; ; SSE2-LABEL: @test( ; SSE2-NEXT: entry: -; SSE2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE2-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4 +; SSE2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; SSE2-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; SSE2-NEXT: [[MUL_18:%.*]] = add i32 [[TMP1]], [[TMP0]] ; SSE2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE2-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; SSE2-NEXT: [[MUL_29:%.*]] = add i32 [[TMP2]], [[MUL_18]] ; SSE2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE2-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; SSE2-NEXT: [[MUL_310:%.*]] = add i32 [[TMP3]], [[MUL_29]] ; SSE2-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE2-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4 +; SSE2-NEXT: [[MUL_411:%.*]] = add i32 [[TMP4]], [[MUL_310]] ; SSE2-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE2-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4 +; SSE2-NEXT: [[MUL_512:%.*]] = add i32 [[TMP5]], [[MUL_411]] ; SSE2-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE2-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4 +; SSE2-NEXT: [[MUL_613:%.*]] = add i32 [[TMP6]], [[MUL_512]] ; SSE2-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; SSE2-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; SSE2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; SSE2-NEXT: [[MUL_18:%.*]] = add i32 undef, undef -; SSE2-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] -; SSE2-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] -; SSE2-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] -; SSE2-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] -; SSE2-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] -; SSE2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; SSE2-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]] -; SSE2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; SSE2-NEXT: 
[[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; SSE2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; SSE2-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; SSE2-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; SSE2-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] -; SSE2-NEXT: ret i32 [[TMP2]] +; SSE2-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4 +; SSE2-NEXT: [[MUL_714:%.*]] = add i32 [[TMP7]], [[MUL_613]] +; SSE2-NEXT: ret i32 [[MUL_714]] ; entry: %0 = load i32, i32* %p, align 4