Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1809,6 +1809,38 @@
   return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
 }
 
+static bool isFreeOp(const Instruction *I, const Type *Ty,
+                     const Instruction *OpI) {
+  switch (I->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return !isa<LoadInst>(I->getOperand(0)) ||
+           !isa<LoadInst>(I->getOperand(1)) || I->getOperand(0) == OpI;
+  case Instruction::Mul:
+    return (!isa<LoadInst>(I->getOperand(0)) ||
+            !isa<LoadInst>(I->getOperand(1)) || I->getOperand(0) == OpI) &&
+           Ty->getScalarSizeInBits() <= 64;
+  case Instruction::Shl:
+  case Instruction::AShr:
+  case Instruction::LShr:
+    return I->getOperand(0) == OpI;
+  case Instruction::FAdd:
+  case Instruction::FSub:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+    return !isa<Constant>(I->getOperand(0)) &&
+           !isa<Constant>(I->getOperand(1)) &&
+           (!isa<LoadInst>(I->getOperand(0)) ||
+            !isa<LoadInst>(I->getOperand(1)) || I->getOperand(0) == OpI);
+  }
+  return false;
+}
+
 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                 unsigned AddressSpace, const Instruction *I) {
   // Handle non-power-of-two vectors such as <3 x float>
@@ -1834,6 +1866,26 @@
                                                Opcode == Instruction::Store);
       return NumElem * Cost + SplitCost;
     }
+  } else if (I && isa<LoadInst>(I) && I->hasOneUse()) {
+    // Check if the cost of the scalar load can be considered 0. We can
+    // consider it 0 if an arithmetic|logic|compare|cast instruction can use
+    // the loaded memory address as one of its operands.
+    const Instruction *UI = I->user_back();
+    if (isFreeOp(UI, UI->getType(), I))
+      return 0;
+    switch (UI->getOpcode()) {
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      if (UI->hasOneUse() &&
+          getCastInstrCost(UI->getOpcode(), UI->getType(),
+                           UI->getOperand(0)->getType(), UI) == 0 &&
+          isFreeOp(UI->user_back(), UI->getType(), UI))
+        return 0;
+      break;
+    default:
+      break;
+    }
   }
 
   // Legalize the type.
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2236,15 +2236,25 @@
     case Instruction::Load: {
       // Cost of wide load - cost of scalar loads.
       unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
+      int ScalarLdCost = 0;
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
-                            TTI->getMemoryOpCost(Instruction::Load, ScalarTy,
-                                                 alignment, 0, VL0);
+        for (const unsigned I : E->ReuseShuffleIndices) {
+          const auto *LD = cast<LoadInst>(VL[I]);
+          ReuseShuffleCost -=
+              TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment,
+                                   LD->getPointerAddressSpace(), LD);
+        }
+      } else {
+        for (Value *V : VL) {
+          const auto *LD = cast<LoadInst>(V);
+          ScalarLdCost +=
+              TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment,
+                                   LD->getPointerAddressSpace(), LD);
+        }
       }
-      int ScalarLdCost = VecTy->getNumElements() *
-          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
-      int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
-                                           VecTy, alignment, 0, VL0);
+      int VecLdCost =
+          TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment,
+                               cast<LoadInst>(VL0)->getPointerAddressSpace());
       return ReuseShuffleCost + VecLdCost - ScalarLdCost;
     }
     case Instruction::Store: {
Index: test/Analysis/CostModel/X86/interleaved-load-i8.ll
===================================================================
--- test/Analysis/CostModel/X86/interleaved-load-i8.ll
+++ test/Analysis/CostModel/X86/interleaved-load-i8.ll
@@ -6,7 +6,7 @@
 
 ; Function Attrs: norecurse nounwind readonly uwtable
 define i32 @doit_stride3(i8* nocapture readonly %Ptr, i32 %Nels) {
-;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %0 = load i8
 ;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction: %0 = load i8
 ;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8
 ;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction: %0 = load i8
Index: test/Analysis/CostModel/X86/vectorized-loop.ll
===================================================================
--- test/Analysis/CostModel/X86/vectorized-loop.ll
+++ test/Analysis/CostModel/X86/vectorized-loop.ll
@@ -53,12 +53,12 @@
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %end.idx.rnd.down, %middle.block ]
   %13 = add nsw i64 %indvars.iv, 2
   %arrayidx = getelementptr inbounds i32, i32* %B, i64 %13
-  ;CHECK: cost of 1 {{.*}} load
+  ;CHECK: cost of 0 {{.*}} load
   %14 = load i32, i32* %arrayidx, align 4
   ;CHECK: cost of 1 {{.*}} mul
   %mul = mul nsw i32 %14, 5
   %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  ;CHECK: cost of 1 {{.*}} load
+  ;CHECK: cost of 0 {{.*}} load
   %15 = load i32, i32* %arrayidx2, align 4
   %add3 = add nsw i32 %15, %mul
   store i32 %add3, i32* %arrayidx2, align 4
Index: test/Transforms/SLPVectorizer/X86/PR36280.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/PR36280.ll
+++ test/Transforms/SLPVectorizer/X86/PR36280.ll
@@ -5,15 +5,12 @@
 ; CHECK-LABEL: @jacobi(
 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr float, float* [[P:%.*]], i64 1
 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr float, float* [[P]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP1]] to <2 x float>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[X:%.*]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[Y:%.*]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
-; CHECK-NEXT: [[ADD1:%.*]] = fadd float 
[[TMP6]], [[Z:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 -; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[TMP7]], [[ADD1]] +; CHECK-NEXT: [[P1:%.*]] = load float, float* [[GEP1]] +; CHECK-NEXT: [[P2:%.*]] = load float, float* [[GEP2]] +; CHECK-NEXT: [[MUL1:%.*]] = fmul float [[P1]], [[X:%.*]] +; CHECK-NEXT: [[MUL2:%.*]] = fmul float [[P2]], [[Y:%.*]] +; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[MUL1]], [[Z:%.*]] +; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[MUL2]], [[ADD1]] ; CHECK-NEXT: ret float [[ADD2]] ; %gep1 = getelementptr float, float* %p, i64 1 Index: test/Transforms/SLPVectorizer/X86/arith-add.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-add.ll +++ test/Transforms/SLPVectorizer/X86/arith-add.ll @@ -40,22 +40,38 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( -; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; 
SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 +; SLM-NEXT: [[R0:%.*]] = add i64 [[A0]], [[B0]] +; SLM-NEXT: [[R1:%.*]] = add i64 [[A1]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = add i64 [[A2]], [[B2]] +; SLM-NEXT: [[R3:%.*]] = add i64 [[A3]], [[B3]] +; SLM-NEXT: [[R4:%.*]] = add i64 [[A4]], [[B4]] +; SLM-NEXT: [[R5:%.*]] = add i64 [[A5]], [[B5]] +; SLM-NEXT: [[R6:%.*]] = add i64 [[A6]], [[B6]] +; SLM-NEXT: [[R7:%.*]] = add i64 [[A7]], [[B7]] +; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v8i64( Index: test/Transforms/SLPVectorizer/X86/arith-mul.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-mul.ll +++ test/Transforms/SLPVectorizer/X86/arith-mul.ll @@ -199,22 +199,70 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @mul_v16i32( -; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* 
getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 +; SLM-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 +; SLM-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 +; SLM-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 +; SLM-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 +; SLM-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 +; SLM-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 +; SLM-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 +; SLM-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 +; SLM-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 +; SLM-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 +; SLM-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 +; SLM-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 +; SLM-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 +; SLM-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 +; SLM-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 +; SLM-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 +; SLM-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 +; SLM-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 +; SLM-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds 
([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 +; SLM-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 +; SLM-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 +; SLM-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 +; SLM-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 +; SLM-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 +; SLM-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 +; SLM-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 +; SLM-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 +; SLM-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 +; SLM-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 +; SLM-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 +; SLM-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 +; SLM-NEXT: [[R0:%.*]] = mul i32 [[A0]], [[B0]] +; SLM-NEXT: [[R1:%.*]] = mul i32 [[A1]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = mul i32 [[A2]], [[B2]] +; SLM-NEXT: [[R3:%.*]] = mul i32 [[A3]], [[B3]] +; SLM-NEXT: [[R4:%.*]] = mul i32 [[A4]], [[B4]] +; SLM-NEXT: [[R5:%.*]] = mul i32 [[A5]], [[B5]] +; SLM-NEXT: [[R6:%.*]] = mul i32 [[A6]], [[B6]] +; SLM-NEXT: [[R7:%.*]] = mul i32 [[A7]], [[B7]] +; SLM-NEXT: [[R8:%.*]] = mul i32 [[A8]], [[B8]] +; SLM-NEXT: [[R9:%.*]] = mul i32 [[A9]], [[B9]] +; SLM-NEXT: [[R10:%.*]] = mul i32 [[A10]], [[B10]] +; SLM-NEXT: [[R11:%.*]] = mul i32 [[A11]], [[B11]] +; SLM-NEXT: [[R12:%.*]] = mul i32 [[A12]], [[B12]] +; SLM-NEXT: [[R13:%.*]] = mul i32 [[A13]], [[B13]] +; SLM-NEXT: [[R14:%.*]] = mul i32 [[A14]], [[B14]] +; SLM-NEXT: [[R15:%.*]] = mul i32 [[A15]], [[B15]] +; SLM-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 +; SLM-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 +; SLM-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 +; SLM-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 +; SLM-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 +; SLM-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 +; SLM-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 +; SLM-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 +; SLM-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 +; SLM-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 +; SLM-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 +; SLM-NEXT: store i32 
[[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 +; SLM-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 +; SLM-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 +; SLM-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 +; SLM-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @mul_v16i32( Index: test/Transforms/SLPVectorizer/X86/arith-sub.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-sub.ll +++ test/Transforms/SLPVectorizer/X86/arith-sub.ll @@ -40,22 +40,38 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v8i64( -; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), 
align 8 +; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 +; SLM-NEXT: [[R0:%.*]] = sub i64 [[A0]], [[B0]] +; SLM-NEXT: [[R1:%.*]] = sub i64 [[A1]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = sub i64 [[A2]], [[B2]] +; SLM-NEXT: [[R3:%.*]] = sub i64 [[A3]], [[B3]] +; SLM-NEXT: [[R4:%.*]] = sub i64 [[A4]], [[B4]] +; SLM-NEXT: [[R5:%.*]] = sub i64 [[A5]], [[B5]] +; SLM-NEXT: [[R6:%.*]] = sub i64 [[A6]], [[B6]] +; SLM-NEXT: [[R7:%.*]] = sub i64 [[A7]], [[B7]] +; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v8i64( Index: test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll +++ test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -37,25 +37,39 @@ define void @fn2(i32* %a, i32* %b, float* %c) { ; CHECK-LABEL: @fn2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 1 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 1 +; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[I1:%.*]] = load i32, i32* [[B:%.*]], align 4 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[I0]], [[I1]] +; CHECK-NEXT: [[FP1:%.*]] = sitofp i32 [[ADD1]] to float +; CHECK-NEXT: [[CALL1:%.*]] = tail call float @llvm.powi.f32(float [[FP1]], i32 [[ADD1]]) #2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 1 +; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = 
getelementptr inbounds i32, i32* [[B]], i32 1 +; CHECK-NEXT: [[I3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[I2]], [[I3]] +; CHECK-NEXT: [[FP2:%.*]] = sitofp i32 [[ADD2]] to float +; CHECK-NEXT: [[CALL2:%.*]] = tail call float @llvm.powi.f32(float [[FP2]], i32 [[ADD1]]) #2 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 2 +; CHECK-NEXT: [[I4:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 2 +; CHECK-NEXT: [[I5:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[I4]], [[I5]] +; CHECK-NEXT: [[FP3:%.*]] = sitofp i32 [[ADD3]] to float +; CHECK-NEXT: [[CALL3:%.*]] = tail call float @llvm.powi.f32(float [[FP3]], i32 [[ADD1]]) #2 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[I6:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.powi.v4f32(<4 x float> [[TMP5]], i32 [[TMP6]]) -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1 +; CHECK-NEXT: [[I7:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[I6]], [[I7]] +; CHECK-NEXT: [[FP4:%.*]] = sitofp i32 [[ADD4]] to float +; CHECK-NEXT: [[CALL4:%.*]] = tail call float @llvm.powi.f32(float [[FP4]], i32 [[ADD1]]) #2 +; CHECK-NEXT: store float [[CALL1]], float* [[C:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C]], i32 1 +; CHECK-NEXT: store float [[CALL2]], float* [[ARRAYIDX8]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2 +; CHECK-NEXT: store float [[CALL3]], float* [[ARRAYIDX9]], align 4 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[C]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 +; CHECK-NEXT: store float [[CALL4]], float* [[ARRAYIDX10]], align 4 ; CHECK-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/horizontal.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal.ll +++ test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -1395,16 +1395,14 @@ ; ; STORE-LABEL: @i32_red_example4( ; STORE-NEXT: entry: -; STORE-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16 -; STORE-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef -; STORE-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]] -; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> -; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[TMP0]], [[RDX_SHUF]] -; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> -; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], 
[[RDX_SHUF1]] -; STORE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 -; STORE-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]] -; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16 +; STORE-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16 +; STORE-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4 +; STORE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] +; STORE-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8 +; STORE-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]] +; STORE-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4 +; STORE-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]] +; STORE-NEXT: store i32 [[ADD_2]], i32* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll +++ test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll @@ -44,30 +44,29 @@ ; ; SSE2-LABEL: @test( ; SSE2-NEXT: entry: -; SSE2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE2-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4 +; SSE2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; SSE2-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; SSE2-NEXT: [[MUL_18:%.*]] = add i32 [[TMP1]], [[TMP0]] ; SSE2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE2-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; SSE2-NEXT: [[MUL_29:%.*]] = add i32 [[TMP2]], [[MUL_18]] ; SSE2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE2-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; SSE2-NEXT: [[MUL_310:%.*]] = add i32 [[TMP3]], [[MUL_29]] ; SSE2-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE2-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4 +; SSE2-NEXT: [[MUL_411:%.*]] = add i32 [[TMP4]], [[MUL_310]] ; SSE2-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE2-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4 +; SSE2-NEXT: [[MUL_512:%.*]] = add i32 [[TMP5]], [[MUL_411]] ; SSE2-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE2-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4 +; SSE2-NEXT: [[MUL_613:%.*]] = add i32 [[TMP6]], [[MUL_512]] ; SSE2-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; SSE2-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* -; SSE2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; SSE2-NEXT: [[MUL_18:%.*]] = add i32 undef, undef -; SSE2-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]] -; SSE2-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]] -; SSE2-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]] -; SSE2-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]] -; SSE2-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]] -; SSE2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> -; SSE2-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]] -; SSE2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; SSE2-NEXT: 
[[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; SSE2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; SSE2-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; SSE2-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; SSE2-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]] -; SSE2-NEXT: ret i32 [[TMP2]] +; SSE2-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4 +; SSE2-NEXT: [[MUL_714:%.*]] = add i32 [[TMP7]], [[MUL_613]] +; SSE2-NEXT: ret i32 [[MUL_714]] ; entry: %0 = load i32, i32* %p, align 4