Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1811,6 +1811,66 @@
   return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
 }
 
+static bool isLoadOrFreeCast(X86TTIImpl *TTI, const Value *V) {
+  if (isa<LoadInst>(V))
+    return true;
+  const auto *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+  switch (I->getOpcode()) {
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    // A cast of a load is free if the cast operation itself is free.
+    return I->hasOneUse() &&
+           TTI->getCastInstrCost(I->getOpcode(), I->getType(),
+                                 I->getOperand(0)->getType(), I) == 0 &&
+           isa<LoadInst>(I->getOperand(0));
+  default:
+    break;
+  }
+  return false;
+}
+
+static bool isFreeOp(X86TTIImpl *TTI, const Instruction *I, const Type *Ty,
+                     const Instruction *OpI) {
+  switch (I->getOpcode()) {
+  case Instruction::Sub:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FSub:
+  case Instruction::FDiv:
+    return I->getOperand(1) == OpI;
+  case Instruction::Shl:
+  case Instruction::AShr:
+  case Instruction::LShr:
+    return I->getOperand(0) == OpI && Ty->getScalarSizeInBits() <= 64;
+  case Instruction::Add:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return !isLoadOrFreeCast(TTI, I->getOperand(0)) ||
+           !isLoadOrFreeCast(TTI, I->getOperand(1)) || I->getOperand(1) == OpI;
+  case Instruction::Mul:
+    // The load instruction is folded only if the size of the data is i8, i16,
+    // i32 or i64.
+    return (!isLoadOrFreeCast(TTI, I->getOperand(0)) ||
+            !isLoadOrFreeCast(TTI, I->getOperand(1)) ||
+            I->getOperand(1) == OpI) &&
+           Ty->getScalarSizeInBits() <= 64;
+  case Instruction::FAdd:
+  case Instruction::FMul:
+    return !isa(I->getOperand(0)) &&
+           !isa(I->getOperand(1)) &&
+           (!isLoadOrFreeCast(TTI, I->getOperand(0)) ||
+            !isLoadOrFreeCast(TTI, I->getOperand(1)) ||
+            I->getOperand(1) == OpI);
+  }
+  return false;
+}
+
 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                 unsigned AddressSpace, const Instruction *I) {
   // Handle non-power-of-two vectors such as <3 x float>
@@ -1836,6 +1896,27 @@
                                              Opcode == Instruction::Store);
       return NumElem * Cost + SplitCost;
     }
+  } else if (I && isa<LoadInst>(I) && I->hasOneUse()) {
+    // Check if the cost of the scalar load can be considered 0. It can be
+    // treated as 0 if its single user is an arithmetic, logic, compare, or
+    // cast instruction that can fold the memory address as an operand.
+    const Instruction *UI = I->user_back();
+    if (isFreeOp(this, UI, UI->getType(), I))
+      return 0;
+    switch (UI->getOpcode()) {
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      // If the user is a free cast, check whether the cast's user folds it.
+      if (UI->hasOneUse() &&
+          getCastInstrCost(UI->getOpcode(), UI->getType(),
+                           UI->getOperand(0)->getType(), UI) == 0 &&
+          isFreeOp(this, UI->user_back(), UI->getType(), UI))
+        return 0;
+      break;
+    default:
+      break;
+    }
   }
 
   // Legalize the type.
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2236,16 +2236,26 @@
     }
     case Instruction::Load: {
       // Cost of wide load - cost of scalar loads.
-      unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
+      const auto *LI = cast<LoadInst>(VL0);
+      unsigned alignment = LI->getAlignment();
+      int ScalarLdCost = 0;
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
-                            TTI->getMemoryOpCost(Instruction::Load, ScalarTy,
-                                                 alignment, 0, VL0);
+        for (const unsigned I : E->ReuseShuffleIndices) {
+          const auto *LD = cast<LoadInst>(VL[I]);
+          ReuseShuffleCost -=
+              TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment,
+                                   LD->getPointerAddressSpace(), LD);
+        }
+      } else {
+        for (Value *V : VL) {
+          const auto *LD = cast<LoadInst>(V);
+          ScalarLdCost +=
+              TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment,
+                                   LD->getPointerAddressSpace(), LD);
+        }
       }
-      int ScalarLdCost = VecTy->getNumElements() *
-          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
-      int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
-                                           VecTy, alignment, 0, VL0);
+      int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment,
+                                           LI->getPointerAddressSpace());
       if (!isConsecutiveAccess(VL[0], VL[1], *DL, *SE)) {
         VecLdCost += TTI->getShuffleCost(
             TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
Index: test/Analysis/CostModel/X86/interleave-load-i32.ll
===================================================================
--- test/Analysis/CostModel/X86/interleave-load-i32.ll
+++ test/Analysis/CostModel/X86/interleave-load-i32.ll
@@ -10,7 +10,7 @@
 ; Function Attrs: nounwind uwtable
 define void @load_i32_interleave4() {
 ;CHECK-LABEL: load_i32_interleave4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load
+;CHECK: Found an estimated cost of 0 for VF 1 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 5 for VF 2 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 5 for VF 4 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %0 = load
@@ -46,7 +46,7 @@
 
 define void @load_i32_interleave5() {
 ;CHECK-LABEL: load_i32_interleave5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load
+;CHECK: Found an estimated cost of 0 for VF 1 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 6 for VF 2 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 9 for VF 4 For instruction: %0 = load
 ;CHECK: Found an estimated cost of 18 for VF 8 For instruction: %0 = load
Index: test/Analysis/CostModel/X86/interleaved-load-i8.ll
===================================================================
--- test/Analysis/CostModel/X86/interleaved-load-i8.ll
+++ test/Analysis/CostModel/X86/interleaved-load-i8.ll
@@ -6,7 +6,7 @@
 
 ; Function Attrs: norecurse nounwind readonly uwtable
 define i32 @doit_stride3(i8* nocapture readonly %Ptr, i32 %Nels) {
-;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %0 = load i8
 ;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction: %0 = load i8
 ;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8
 ;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction: %0 = load i8
Index: test/Analysis/CostModel/X86/vectorized-loop.ll
===================================================================
--- test/Analysis/CostModel/X86/vectorized-loop.ll
+++ test/Analysis/CostModel/X86/vectorized-loop.ll
@@ -53,12 +53,12 @@
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %end.idx.rnd.down, %middle.block ]
   %13 = add nsw i64 %indvars.iv, 2
   %arrayidx = getelementptr inbounds i32, i32* %B, i64 %13
-
;CHECK: cost of 1 {{.*}} load + ;CHECK: cost of 0 {{.*}} load %14 = load i32, i32* %arrayidx, align 4 ;CHECK: cost of 1 {{.*}} mul %mul = mul nsw i32 %14, 5 %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv - ;CHECK: cost of 1 {{.*}} load + ;CHECK: cost of 0 {{.*}} load %15 = load i32, i32* %arrayidx2, align 4 %add3 = add nsw i32 %15, %mul store i32 %add3, i32* %arrayidx2, align 4 Index: test/Transforms/SLPVectorizer/X86/PR35777.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/PR35777.ll +++ test/Transforms/SLPVectorizer/X86/PR35777.ll @@ -6,20 +6,24 @@ define { i64, i64 } @patatino(double %arg) { ; CHECK-LABEL: @patatino( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, <2 x double>* bitcast ([6 x double]* @global to <2 x double>*), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2) to <2 x double>*), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[ARG]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP0]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16 -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP11]], 1 +; CHECK-NEXT: [[TMP:%.*]] = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 0), align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fmul double [[TMP1]], [[ARG:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd double [[TMP]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4), align 16 +; CHECK-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = fptosi double [[TMP5]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 1), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 3), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = fmul double [[TMP9]], [[ARG]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd double [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 5), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = fadd double [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = fptosi double [[TMP13]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP7]], 0 +; CHECK-NEXT: 
[[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP15]], 1 ; CHECK-NEXT: ret { i64, i64 } [[TMP17]] ; bb: Index: test/Transforms/SLPVectorizer/X86/addsub.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/addsub.ll +++ test/Transforms/SLPVectorizer/X86/addsub.ll @@ -311,22 +311,22 @@ define void @reorder_alt_rightsubTree(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %d) { ; CHECK-LABEL: @reorder_alt_rightsubTree( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[D]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A]] to <2 x double>* -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[B]] to <2 x double>* -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP6]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = fsub <2 x double> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x double> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[C]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP13]], <2 x double>* [[TMP15]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[B:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd double [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load double, double* [[D:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = fsub double [[TMP3]], [[TMP4]] +; CHECK-NEXT: store double [[TMP5]], double* [[C:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[D]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = load double, double* [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = load double, double* [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd double [[TMP9]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd double [[TMP7]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[C]], i64 1 +; CHECK-NEXT: store double [[TMP13]], double* [[TMP14]] ; CHECK-NEXT: ret void ; %1 = load double, double* %a Index: test/Transforms/SLPVectorizer/X86/arith-add.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-add.ll +++ test/Transforms/SLPVectorizer/X86/arith-add.ll @@ -40,22 +40,38 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( -; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast 
(i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 +; SLM-NEXT: [[R0:%.*]] = add i64 [[A0]], [[B0]] +; 
SLM-NEXT: [[R1:%.*]] = add i64 [[A1]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = add i64 [[A2]], [[B2]] +; SLM-NEXT: [[R3:%.*]] = add i64 [[A3]], [[B3]] +; SLM-NEXT: [[R4:%.*]] = add i64 [[A4]], [[B4]] +; SLM-NEXT: [[R5:%.*]] = add i64 [[A5]], [[B5]] +; SLM-NEXT: [[R6:%.*]] = add i64 [[A6]], [[B6]] +; SLM-NEXT: [[R7:%.*]] = add i64 [[A7]], [[B7]] +; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v8i64( Index: test/Transforms/SLPVectorizer/X86/arith-mul.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-mul.ll +++ test/Transforms/SLPVectorizer/X86/arith-mul.ll @@ -199,22 +199,70 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @mul_v16i32( -; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x 
i32>*), align 4 +; SLM-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 +; SLM-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 +; SLM-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 +; SLM-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 +; SLM-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 +; SLM-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 +; SLM-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 +; SLM-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 +; SLM-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 +; SLM-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 +; SLM-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 +; SLM-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 +; SLM-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 +; SLM-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 +; SLM-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 +; SLM-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 +; SLM-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 +; SLM-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 +; SLM-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 +; SLM-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 +; SLM-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 +; SLM-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 +; SLM-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 +; SLM-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 +; SLM-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 +; SLM-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 +; SLM-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 +; SLM-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 +; SLM-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 +; SLM-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, 
i32 0, i64 13), align 4 +; SLM-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 +; SLM-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 +; SLM-NEXT: [[R0:%.*]] = mul i32 [[A0]], [[B0]] +; SLM-NEXT: [[R1:%.*]] = mul i32 [[A1]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = mul i32 [[A2]], [[B2]] +; SLM-NEXT: [[R3:%.*]] = mul i32 [[A3]], [[B3]] +; SLM-NEXT: [[R4:%.*]] = mul i32 [[A4]], [[B4]] +; SLM-NEXT: [[R5:%.*]] = mul i32 [[A5]], [[B5]] +; SLM-NEXT: [[R6:%.*]] = mul i32 [[A6]], [[B6]] +; SLM-NEXT: [[R7:%.*]] = mul i32 [[A7]], [[B7]] +; SLM-NEXT: [[R8:%.*]] = mul i32 [[A8]], [[B8]] +; SLM-NEXT: [[R9:%.*]] = mul i32 [[A9]], [[B9]] +; SLM-NEXT: [[R10:%.*]] = mul i32 [[A10]], [[B10]] +; SLM-NEXT: [[R11:%.*]] = mul i32 [[A11]], [[B11]] +; SLM-NEXT: [[R12:%.*]] = mul i32 [[A12]], [[B12]] +; SLM-NEXT: [[R13:%.*]] = mul i32 [[A13]], [[B13]] +; SLM-NEXT: [[R14:%.*]] = mul i32 [[A14]], [[B14]] +; SLM-NEXT: [[R15:%.*]] = mul i32 [[A15]], [[B15]] +; SLM-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 +; SLM-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 +; SLM-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 +; SLM-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 +; SLM-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 +; SLM-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 +; SLM-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 +; SLM-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 +; SLM-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 +; SLM-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 +; SLM-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 +; SLM-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 +; SLM-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 +; SLM-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 +; SLM-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 +; SLM-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @mul_v16i32( Index: test/Transforms/SLPVectorizer/X86/arith-sub.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/arith-sub.ll +++ test/Transforms/SLPVectorizer/X86/arith-sub.ll @@ -40,22 +40,38 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v8i64( -; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast 
(i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 +; SLM-NEXT: [[R0:%.*]] = sub i64 [[A0]], [[B0]] +; 
SLM-NEXT: [[R1:%.*]] = sub i64 [[A1]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = sub i64 [[A2]], [[B2]] +; SLM-NEXT: [[R3:%.*]] = sub i64 [[A3]], [[B3]] +; SLM-NEXT: [[R4:%.*]] = sub i64 [[A4]], [[B4]] +; SLM-NEXT: [[R5:%.*]] = sub i64 [[A5]], [[B5]] +; SLM-NEXT: [[R6:%.*]] = sub i64 [[A6]], [[B6]] +; SLM-NEXT: [[R7:%.*]] = sub i64 [[A7]], [[B7]] +; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v8i64( Index: test/Transforms/SLPVectorizer/X86/compare-reduce.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/compare-reduce.ll +++ test/Transforms/SLPVectorizer/X86/compare-reduce.ll @@ -10,21 +10,22 @@ ; CHECK-LABEL: @reduce_compare( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[CONV]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[CONV]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> , [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> , [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[CMP11:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[MUL1:%.*]] = fmul double [[CONV]], [[TMP1]] +; CHECK-NEXT: [[MUL2:%.*]] = fmul double [[MUL1]], 7.000000e+00 +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL2]], 5.000000e+00 +; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[ARRAYIDX6]], align 8 +; CHECK-NEXT: [[MUL8:%.*]] = fmul double [[CONV]], [[TMP3]] +; CHECK-NEXT: [[MUL9:%.*]] = fmul double [[MUL8]], 4.000000e+00 +; CHECK-NEXT: [[ADD10:%.*]] = fadd double [[MUL9]], 9.000000e+00 +; CHECK-NEXT: 
[[CMP11:%.*]] = fcmp ogt double [[ADD]], [[ADD10]] ; CHECK-NEXT: br i1 [[CMP11]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i64 0, i64 0)) Index: test/Transforms/SLPVectorizer/X86/cse.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/cse.ll +++ test/Transforms/SLPVectorizer/X86/cse.ll @@ -253,28 +253,27 @@ define i32 @partial_mrg(double* nocapture %A, i32 %n) { ; CHECK-LABEL: @partial_mrg( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[A:%.*]], align 8 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[CONV]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[CONV]], [[TMP0]] +; CHECK-NEXT: store double [[MUL]], double* [[A]], align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[MUL4:%.*]] = fmul double [[CONV]], [[TMP1]] +; CHECK-NEXT: store double [[MUL4]], double* [[ARRAYIDX2]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[N]], 4 ; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX7]], align 8 +; CHECK-NEXT: [[MUL9:%.*]] = fmul double [[CONV]], [[TMP2]] +; CHECK-NEXT: store double [[MUL9]], double* [[ARRAYIDX7]], align 8 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds double, double* [[A]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[ARRAYIDX11]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[N]], 4 ; CHECK-NEXT: [[CONV12:%.*]] = sitofp i32 [[ADD]] to double -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV12]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[MUL13:%.*]] = fmul double [[CONV12]], [[TMP3]] +; CHECK-NEXT: store double [[MUL13]], double* [[ARRAYIDX11]], align 8 ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: ; CHECK-NEXT: ret i32 0 Index: test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll +++ test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -37,25 +37,39 @@ define void @fn2(i32* %a, i32* %b, float* %c) { ; CHECK-LABEL: @fn2( ; CHECK-NEXT: entry: -; CHECK-NEXT: 
[[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 1 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 1 +; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[I1:%.*]] = load i32, i32* [[B:%.*]], align 4 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[I0]], [[I1]] +; CHECK-NEXT: [[FP1:%.*]] = sitofp i32 [[ADD1]] to float +; CHECK-NEXT: [[CALL1:%.*]] = tail call float @llvm.powi.f32(float [[FP1]], i32 [[ADD1]]) #2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 1 +; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 1 +; CHECK-NEXT: [[I3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[I2]], [[I3]] +; CHECK-NEXT: [[FP2:%.*]] = sitofp i32 [[ADD2]] to float +; CHECK-NEXT: [[CALL2:%.*]] = tail call float @llvm.powi.f32(float [[FP2]], i32 [[ADD1]]) #2 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 2 +; CHECK-NEXT: [[I4:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 2 +; CHECK-NEXT: [[I5:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[I4]], [[I5]] +; CHECK-NEXT: [[FP3:%.*]] = sitofp i32 [[ADD3]] to float +; CHECK-NEXT: [[CALL3:%.*]] = tail call float @llvm.powi.f32(float [[FP3]], i32 [[ADD1]]) #2 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[I6:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.powi.v4f32(<4 x float> [[TMP5]], i32 [[TMP6]]) -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1 +; CHECK-NEXT: [[I7:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[I6]], [[I7]] +; CHECK-NEXT: [[FP4:%.*]] = sitofp i32 [[ADD4]] to float +; CHECK-NEXT: [[CALL4:%.*]] = tail call float @llvm.powi.f32(float [[FP4]], i32 [[ADD1]]) #2 +; CHECK-NEXT: store float [[CALL1]], float* [[C:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C]], i32 1 +; CHECK-NEXT: store float [[CALL2]], float* [[ARRAYIDX8]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2 +; CHECK-NEXT: store float [[CALL3]], float* [[ARRAYIDX9]], align 4 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[C]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 +; CHECK-NEXT: store float [[CALL4]], float* [[ARRAYIDX10]], align 4 ; CHECK-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/funclet.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/funclet.ll +++ 
test/Transforms/SLPVectorizer/X86/funclet.ll @@ -12,17 +12,19 @@ ; CHECK-NEXT: [[TMP0:%.*]] = catchswitch within none [label %catch] unwind to caller ; CHECK: catch: ; CHECK-NEXT: [[TMP1:%.*]] = catchpad within [[TMP0]] [i8* null, i32 64, i8* null] -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[A]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[B]] to <2 x double>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP6]]) [ "funclet"(token [[TMP1]]) ] -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[C]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; CHECK-NEXT: [[I0:%.*]] = load double, double* [[A:%.*]], align 8 +; CHECK-NEXT: [[I1:%.*]] = load double, double* [[B:%.*]], align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[I0]], [[I1]] +; CHECK-NEXT: [[CALL:%.*]] = tail call double @floor(double [[MUL]]) #1 [ "funclet"(token [[TMP1]]) ] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 +; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 +; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]] +; CHECK-NEXT: [[CALL5:%.*]] = tail call double @floor(double [[MUL5]]) #1 [ "funclet"(token [[TMP1]]) ] +; CHECK-NEXT: store double [[CALL]], double* [[C:%.*]], align 8 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1 +; CHECK-NEXT: store double [[CALL5]], double* [[ARRAYIDX5]], align 8 ; CHECK-NEXT: catchret from [[TMP1]] to label [[TRY_CONT:%.*]] ; CHECK: try.cont: ; CHECK-NEXT: ret void Index: test/Transforms/SLPVectorizer/X86/horizontal-list.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -13,25 +13,27 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]] -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 
x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]] +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16 +; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4 +; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8 +; CHECK-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]] ; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]] -; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]] -; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]] -; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]] -; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]] +; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL4]], [[ADD7]] +; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD19]] +; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD19_1]] +; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD19_2]] ; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4 ; CHECK-NEXT: ret float [[ADD19_3]] ; @@ -1060,15 +1062,17 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4 ; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[TMP0]] ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP2]], [[ADD_1]] ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4 +; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP3]], [[ADD_2]] ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX_4]], align 4 +; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float [[TMP4]], [[ADD_3]] ; 
CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] -; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] -; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]] -; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX_5]], align 4 +; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float [[TMP5]], [[ADD_4]] ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8 ; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9 @@ -1077,8 +1081,8 @@ ; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12 ; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 13 ; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>* -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4 ; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]] ; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]] ; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]] @@ -1103,8 +1107,8 @@ ; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28 ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* -; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* +; CHECK-NEXT: [[TMP9:%.*]] = load <16 x float>, <16 x float>* [[TMP8]], align 4 ; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]] ; CHECK-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]] ; CHECK-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]] @@ -1120,33 +1124,31 @@ ; CHECK-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]] ; CHECK-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]] ; CHECK-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> undef, <16 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP7]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> undef, <16 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP9]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: 
[[BIN_RDX6:%.*]] = fadd fast <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 -; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP5]], [[RDX_SHUF7]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 +; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP7]], [[RDX_SHUF7]] ; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <8 x float> [[BIN_RDX8]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX10:%.*]] = fadd fast <8 x float> [[BIN_RDX8]], [[RDX_SHUF9]] ; CHECK-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0 -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF13]] -; CHECK-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <4 x float> [[BIN_RDX14]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0 -; CHECK-NEXT: [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0 +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[OP_RDX]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd fast float [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd fast float [[TMP13]], [[TMP3]] +; CHECK-NEXT: [[TMP15:%.*]] = fadd fast float [[TMP14]], [[TMP2]] +; CHECK-NEXT: [[TMP16:%.*]] = fadd fast float [[TMP15]], [[TMP1]] +; CHECK-NEXT: [[TMP17:%.*]] = fadd fast float [[TMP16]], [[TMP0]] ; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]] -; CHECK-NEXT: ret float [[TMP12]] +; CHECK-NEXT: ret float [[TMP17]] ; ; THRESHOLD-LABEL: @loadadd31( ; THRESHOLD-NEXT: entry: Index: test/Transforms/SLPVectorizer/X86/horizontal.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal.ll +++ test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -948,23 +948,22 @@ ; STORE-NEXT: [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; STORE-NEXT: br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; STORE: for.body.lr.ph: -; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 -; STORE-NEXT: [[TMP0:%.*]] = bitcast double* [[B]] to <2 x double>* -; STORE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; STORE-NEXT: [[TMP0:%.*]] = load double, double* [[B:%.*]], align 8 +; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 +; STORE-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX4]], align 8 ; STORE-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64 ; STORE-NEXT: br label [[FOR_BODY:%.*]] ; STORE: for.body: ; STORE-NEXT: [[I_018:%.*]] = phi i64 [ 
0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; STORE-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_018]], 2 ; STORE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[MUL]] +; STORE-NEXT: [[TMP3:%.*]] = load double, double* [[ARRAYIDX2]], align 8 +; STORE-NEXT: [[MUL3:%.*]] = fmul fast double [[TMP0]], [[TMP3]] ; STORE-NEXT: [[ADD16:%.*]] = or i64 [[MUL]], 1 ; STORE-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[ADD16]] -; STORE-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>* -; STORE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 -; STORE-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP4]] -; STORE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -; STORE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 -; STORE-NEXT: [[ADD8:%.*]] = fadd fast double [[TMP6]], [[TMP7]] +; STORE-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX6]], align 8 +; STORE-NEXT: [[MUL7:%.*]] = fmul fast double [[TMP1]], [[TMP4]] +; STORE-NEXT: [[ADD8:%.*]] = fadd fast double [[MUL3]], [[MUL7]] ; STORE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]] ; STORE-NEXT: store double [[ADD8]], double* [[ARRAYIDX9]], align 8 ; STORE-NEXT: [[INC]] = add nsw i64 [[I_018]], 1 @@ -1168,16 +1167,14 @@ ; ; STORE-LABEL: @float_red_example4( ; STORE-NEXT: entry: -; STORE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16 -; STORE-NEXT: [[ADD:%.*]] = fadd fast float undef, undef -; STORE-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]] -; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> -; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP0]], [[RDX_SHUF]] -; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> -; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] -; STORE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; STORE-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] -; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16 +; STORE-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16 +; STORE-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4 +; STORE-NEXT: [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]] +; STORE-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8 +; STORE-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]] +; STORE-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4 +; STORE-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]] +; STORE-NEXT: store float [[ADD_2]], float* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; entry: @@ -1371,16 +1368,14 @@ ; ; STORE-LABEL: @i32_red_example4( ; STORE-NEXT: entry: -; STORE-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16 -; STORE-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef -; STORE-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]] -; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> -; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[TMP0]], 
-; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
-; STORE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
-; STORE-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]]
-; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
+; STORE-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+; STORE-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+; STORE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; STORE-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+; STORE-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; STORE-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+; STORE-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
+; STORE-NEXT: store i32 [[ADD_2]], i32* [[RES:%.*]], align 16
; STORE-NEXT: ret void
;
entry:
Index: test/Transforms/SLPVectorizer/X86/in-tree-user.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/in-tree-user.ll
+++ test/Transforms/SLPVectorizer/X86/in-tree-user.ll
@@ -11,22 +11,23 @@
; CHECK-LABEL: @in_tree_user(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[CONV]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[CONV]], i32 1
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> <double 7.000000e+00, double 4.000000e+00>, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> <double 5.000000e+00, double 9.000000e+00>, [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
-; CHECK-NEXT: [[INTREEUSER:%.*]] = fadd double [[TMP8]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
-; CHECK-NEXT: [[CMP11:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[MUL1:%.*]] = fmul double [[CONV]], [[TMP1]]
+; CHECK-NEXT: [[MUL2:%.*]] = fmul double [[MUL1]], 7.000000e+00
+; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL2]], 5.000000e+00
+; CHECK-NEXT: [[INTREEUSER:%.*]] = fadd double [[ADD]], [[ADD]]
+; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[MUL8:%.*]] = fmul double [[CONV]], [[TMP3]]
+; CHECK-NEXT: [[MUL9:%.*]] = fmul double [[MUL8]], 4.000000e+00
+; CHECK-NEXT: [[ADD10:%.*]] = fadd double [[MUL9]], 
9.000000e+00 +; CHECK-NEXT: [[CMP11:%.*]] = fcmp ogt double [[ADD]], [[ADD10]] ; CHECK-NEXT: br i1 [[CMP11]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i64 0, i64 0)) Index: test/Transforms/SLPVectorizer/X86/insertvalue.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/insertvalue.ll +++ test/Transforms/SLPVectorizer/X86/insertvalue.ll @@ -5,23 +5,23 @@ ; CHECK-LABEL: @julia_2xdouble( ; CHECK-NEXT: top: ; CHECK-NEXT: [[PX0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[X0:%.*]] = load double, double* [[PX0]], align 4 ; CHECK-NEXT: [[PY0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[Y0:%.*]] = load double, double* [[PY0]], align 4 +; CHECK-NEXT: [[M0:%.*]] = fmul double [[X0]], [[Y0]] ; CHECK-NEXT: [[PX1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2]], i64 0, i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[PX0]] to <2 x double>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 4 +; CHECK-NEXT: [[X1:%.*]] = load double, double* [[PX1]], align 4 ; CHECK-NEXT: [[PY1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3]], i64 0, i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[PY0]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[Y1:%.*]] = load double, double* [[PY1]], align 4 +; CHECK-NEXT: [[M1:%.*]] = fmul double [[X1]], [[Y1]] ; CHECK-NEXT: [[PZ0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[Z0:%.*]] = load double, double* [[PZ0]], align 4 +; CHECK-NEXT: [[A0:%.*]] = fadd double [[M0]], [[Z0]] +; CHECK-NEXT: [[I0:%.*]] = insertvalue [2 x double] undef, double [[A0]], 0 ; CHECK-NEXT: [[PZ1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1]], i64 0, i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[PZ0]] to <2 x double>* -; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0 -; CHECK-NEXT: [[I0:%.*]] = insertvalue [2 x double] undef, double [[TMP12]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP11]], i32 1 -; CHECK-NEXT: [[I1:%.*]] = insertvalue [2 x double] [[I0]], double [[TMP13]], 1 +; CHECK-NEXT: [[Z1:%.*]] = load double, double* [[PZ1]], align 4 +; CHECK-NEXT: [[A1:%.*]] = fadd double [[M1]], [[Z1]] +; CHECK-NEXT: [[I1:%.*]] = insertvalue [2 x double] [[I0]], double [[A1]], 1 ; CHECK-NEXT: store [2 x double] [[I1]], [2 x double]* [[TMP0:%.*]], align 4 ; CHECK-NEXT: ret void ; Index: test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll +++ test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll @@ -44,30 +44,29 @@ ; ; SSE2-LABEL: @test( ; SSE2-NEXT: entry: -; SSE2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; SSE2-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4 +; SSE2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; 
SSE2-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; SSE2-NEXT: [[MUL_18:%.*]] = add i32 [[TMP1]], [[TMP0]]
; SSE2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; SSE2-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; SSE2-NEXT: [[MUL_29:%.*]] = add i32 [[TMP2]], [[MUL_18]]
; SSE2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; SSE2-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; SSE2-NEXT: [[MUL_310:%.*]] = add i32 [[TMP3]], [[MUL_29]]
; SSE2-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; SSE2-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; SSE2-NEXT: [[MUL_411:%.*]] = add i32 [[TMP4]], [[MUL_310]]
; SSE2-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; SSE2-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; SSE2-NEXT: [[MUL_512:%.*]] = add i32 [[TMP5]], [[MUL_411]]
; SSE2-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; SSE2-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; SSE2-NEXT: [[MUL_613:%.*]] = add i32 [[TMP6]], [[MUL_512]]
; SSE2-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
-; SSE2-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
-; SSE2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; SSE2-NEXT: [[MUL_18:%.*]] = add i32 undef, undef
-; SSE2-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]]
-; SSE2-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]]
-; SSE2-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]]
-; SSE2-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]]
-; SSE2-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]]
-; SSE2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]]
-; SSE2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
-; SSE2-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE2-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
-; SSE2-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; SSE2-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]]
-; SSE2-NEXT: ret i32 [[TMP2]]
+; SSE2-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; SSE2-NEXT: [[MUL_714:%.*]] = add i32 [[TMP7]], [[MUL_613]]
+; SSE2-NEXT: ret i32 [[MUL_714]]
;
entry:
%0 = load i32, i32* %p, align 4
Index: test/Transforms/SLPVectorizer/X86/return.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/return.ll
+++ test/Transforms/SLPVectorizer/X86/return.ll
@@ -16,12 +16,13 @@
define double @return1() {
; CHECK-LABEL: @return1(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, <2 x double>* bitcast ([4 x double]* @a to <2 x double>*), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([4 x double]* @b to <2 x double>*), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
-; CHECK-NEXT: [[ADD2:%.*]] = fadd double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([4 x double], [4 x double]* @a, i32 0, i32 0), align 8
+; 
CHECK-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([4 x double], [4 x double]* @b, i32 0, i32 0), align 8 +; CHECK-NEXT: [[ADD0:%.*]] = fadd double [[A0]], [[B0]] +; CHECK-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([4 x double], [4 x double]* @a, i32 0, i32 1), align 8 +; CHECK-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([4 x double], [4 x double]* @b, i32 0, i32 1), align 8 +; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[A1]], [[B1]] +; CHECK-NEXT: [[ADD2:%.*]] = fadd double [[ADD0]], [[ADD1]] ; CHECK-NEXT: ret double [[ADD2]] ; entry: @@ -42,17 +43,16 @@ define double @return2(double* nocapture readonly %x) { ; CHECK-LABEL: @return2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i32 2 +; CHECK-NEXT: [[X0:%.*]] = load double, double* [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[X]], i32 2 +; CHECK-NEXT: [[X2:%.*]] = load double, double* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = fadd double [[X0]], [[X2]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X]], i32 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[X]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 +; CHECK-NEXT: [[X1:%.*]] = load double, double* [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[X]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX1]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 -; CHECK-NEXT: [[ADD5:%.*]] = fadd double [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[X3:%.*]] = load double, double* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ADD4:%.*]] = fadd double [[X1]], [[X3]] +; CHECK-NEXT: [[ADD5:%.*]] = fadd double [[ADD3]], [[ADD4]] ; CHECK-NEXT: ret double [[ADD5]] ; entry: Index: test/Transforms/SLPVectorizer/X86/shift-ashr.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/shift-ashr.ll +++ test/Transforms/SLPVectorizer/X86/shift-ashr.ll @@ -223,22 +223,70 @@ ; SSE-NEXT: ret void ; ; AVX1-LABEL: @ashr_v16i32( -; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* 
getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; AVX1-NEXT: [[TMP9:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP5]] -; AVX1-NEXT: [[TMP10:%.*]] = ashr <4 x i32> [[TMP2]], [[TMP6]] -; AVX1-NEXT: [[TMP11:%.*]] = ashr <4 x i32> [[TMP3]], [[TMP7]] -; AVX1-NEXT: [[TMP12:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP8]] -; AVX1-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; AVX1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX1-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX1-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 +; AVX1-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 +; AVX1-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 +; AVX1-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 +; AVX1-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 +; AVX1-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 +; AVX1-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 +; AVX1-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 +; AVX1-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 +; AVX1-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 +; AVX1-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 +; AVX1-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 +; AVX1-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 +; AVX1-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 +; AVX1-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 +; AVX1-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 
+; AVX1-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 +; AVX1-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 +; AVX1-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 +; AVX1-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 +; AVX1-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 +; AVX1-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 +; AVX1-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 +; AVX1-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 +; AVX1-NEXT: [[R0:%.*]] = ashr i32 [[A0]], [[B0]] +; AVX1-NEXT: [[R1:%.*]] = ashr i32 [[A1]], [[B1]] +; AVX1-NEXT: [[R2:%.*]] = ashr i32 [[A2]], [[B2]] +; AVX1-NEXT: [[R3:%.*]] = ashr i32 [[A3]], [[B3]] +; AVX1-NEXT: [[R4:%.*]] = ashr i32 [[A4]], [[B4]] +; AVX1-NEXT: [[R5:%.*]] = ashr i32 [[A5]], [[B5]] +; AVX1-NEXT: [[R6:%.*]] = ashr i32 [[A6]], [[B6]] +; AVX1-NEXT: [[R7:%.*]] = ashr i32 [[A7]], [[B7]] +; AVX1-NEXT: [[R8:%.*]] = ashr i32 [[A8]], [[B8]] +; AVX1-NEXT: [[R9:%.*]] = ashr i32 [[A9]], [[B9]] +; AVX1-NEXT: [[R10:%.*]] = ashr i32 [[A10]], [[B10]] +; AVX1-NEXT: [[R11:%.*]] = ashr i32 [[A11]], [[B11]] +; AVX1-NEXT: [[R12:%.*]] = ashr i32 [[A12]], [[B12]] +; AVX1-NEXT: [[R13:%.*]] = ashr i32 [[A13]], [[B13]] +; AVX1-NEXT: [[R14:%.*]] = ashr i32 [[A14]], [[B14]] +; AVX1-NEXT: [[R15:%.*]] = ashr i32 [[A15]], [[B15]] +; AVX1-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 +; AVX1-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 +; AVX1-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 +; AVX1-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 +; AVX1-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 +; AVX1-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 +; AVX1-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 +; AVX1-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 +; AVX1-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 +; AVX1-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 +; AVX1-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 +; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 +; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 +; AVX1-NEXT: store i32 
[[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 +; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 +; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ashr_v16i32( @@ -634,24 +682,321 @@ } define void @ashr_v64i8() { -; CHECK-LABEL: @ashr_v64i8( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; CHECK-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]] -; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 -; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 -; CHECK-NEXT: ret void +; SSE-LABEL: @ashr_v64i8( +; SSE-NEXT: [[A0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0), align 1 +; SSE-NEXT: [[A1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1), align 1 +; SSE-NEXT: [[A2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2), align 1 +; SSE-NEXT: [[A3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3), align 1 +; SSE-NEXT: [[A4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4), align 1 +; SSE-NEXT: [[A5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5), align 1 +; SSE-NEXT: [[A6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6), align 1 +; SSE-NEXT: [[A7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7), align 1 +; SSE-NEXT: [[A8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8), align 1 +; SSE-NEXT: [[A9:%.*]] = load 
i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9), align 1 +; SSE-NEXT: [[A10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1 +; SSE-NEXT: [[A11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1 +; SSE-NEXT: [[A12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1 +; SSE-NEXT: [[A13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1 +; SSE-NEXT: [[A14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1 +; SSE-NEXT: [[A15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1 +; SSE-NEXT: [[A16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1 +; SSE-NEXT: [[A17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1 +; SSE-NEXT: [[A18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1 +; SSE-NEXT: [[A19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1 +; SSE-NEXT: [[A20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1 +; SSE-NEXT: [[A21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1 +; SSE-NEXT: [[A22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1 +; SSE-NEXT: [[A23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1 +; SSE-NEXT: [[A24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1 +; SSE-NEXT: [[A25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1 +; SSE-NEXT: [[A26:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1 +; SSE-NEXT: [[A27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1 +; SSE-NEXT: [[A28:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1 +; SSE-NEXT: [[A29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1 +; SSE-NEXT: [[A30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1 +; SSE-NEXT: [[A31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1 +; SSE-NEXT: [[A32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1 +; SSE-NEXT: [[A33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1 +; SSE-NEXT: [[A34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1 +; SSE-NEXT: [[A35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1 +; SSE-NEXT: [[A36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1 +; SSE-NEXT: [[A37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1 +; SSE-NEXT: [[A38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1 +; SSE-NEXT: [[A39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1 +; SSE-NEXT: 
[[A40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1 +; SSE-NEXT: [[A41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1 +; SSE-NEXT: [[A42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1 +; SSE-NEXT: [[A43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1 +; SSE-NEXT: [[A44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1 +; SSE-NEXT: [[A45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1 +; SSE-NEXT: [[A46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1 +; SSE-NEXT: [[A47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1 +; SSE-NEXT: [[A48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1 +; SSE-NEXT: [[A49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1 +; SSE-NEXT: [[A50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1 +; SSE-NEXT: [[A51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1 +; SSE-NEXT: [[A52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1 +; SSE-NEXT: [[A53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1 +; SSE-NEXT: [[A54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1 +; SSE-NEXT: [[A55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1 +; SSE-NEXT: [[A56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1 +; SSE-NEXT: [[A57:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1 +; SSE-NEXT: [[A58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1 +; SSE-NEXT: [[A59:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1 +; SSE-NEXT: [[A60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1 +; SSE-NEXT: [[A61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1 +; SSE-NEXT: [[A62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1 +; SSE-NEXT: [[A63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1 +; SSE-NEXT: [[B0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0), align 1 +; SSE-NEXT: [[B1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1), align 1 +; SSE-NEXT: [[B2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2), align 1 +; SSE-NEXT: [[B3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3), align 1 +; SSE-NEXT: [[B4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4), align 1 +; SSE-NEXT: [[B5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5), align 1 +; SSE-NEXT: [[B6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6), align 1 +; SSE-NEXT: 
[[B7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7), align 1 +; SSE-NEXT: [[B8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8), align 1 +; SSE-NEXT: [[B9:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9), align 1 +; SSE-NEXT: [[B10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1 +; SSE-NEXT: [[B11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1 +; SSE-NEXT: [[B12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1 +; SSE-NEXT: [[B13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1 +; SSE-NEXT: [[B14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1 +; SSE-NEXT: [[B15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1 +; SSE-NEXT: [[B16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1 +; SSE-NEXT: [[B17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1 +; SSE-NEXT: [[B18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1 +; SSE-NEXT: [[B19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1 +; SSE-NEXT: [[B20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1 +; SSE-NEXT: [[B21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1 +; SSE-NEXT: [[B22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1 +; SSE-NEXT: [[B23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1 +; SSE-NEXT: [[B24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1 +; SSE-NEXT: [[B25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1 +; SSE-NEXT: [[B26:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1 +; SSE-NEXT: [[B27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1 +; SSE-NEXT: [[B28:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1 +; SSE-NEXT: [[B29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1 +; SSE-NEXT: [[B30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1 +; SSE-NEXT: [[B31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1 +; SSE-NEXT: [[B32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1 +; SSE-NEXT: [[B33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1 +; SSE-NEXT: [[B34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1 +; SSE-NEXT: [[B35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1 +; SSE-NEXT: [[B36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1 +; SSE-NEXT: [[B37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1 +; 
SSE-NEXT: [[B38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1 +; SSE-NEXT: [[B39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1 +; SSE-NEXT: [[B40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1 +; SSE-NEXT: [[B41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1 +; SSE-NEXT: [[B42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1 +; SSE-NEXT: [[B43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1 +; SSE-NEXT: [[B44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1 +; SSE-NEXT: [[B45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1 +; SSE-NEXT: [[B46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1 +; SSE-NEXT: [[B47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1 +; SSE-NEXT: [[B48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1 +; SSE-NEXT: [[B49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1 +; SSE-NEXT: [[B50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1 +; SSE-NEXT: [[B51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1 +; SSE-NEXT: [[B52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1 +; SSE-NEXT: [[B53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1 +; SSE-NEXT: [[B54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1 +; SSE-NEXT: [[B55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1 +; SSE-NEXT: [[B56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1 +; SSE-NEXT: [[B57:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1 +; SSE-NEXT: [[B58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1 +; SSE-NEXT: [[B59:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1 +; SSE-NEXT: [[B60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1 +; SSE-NEXT: [[B61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1 +; SSE-NEXT: [[B62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1 +; SSE-NEXT: [[B63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1 +; SSE-NEXT: [[R0:%.*]] = ashr i8 [[A0]], [[B0]] +; SSE-NEXT: [[R1:%.*]] = ashr i8 [[A1]], [[B1]] +; SSE-NEXT: [[R2:%.*]] = ashr i8 [[A2]], [[B2]] +; SSE-NEXT: [[R3:%.*]] = ashr i8 [[A3]], [[B3]] +; SSE-NEXT: [[R4:%.*]] = ashr i8 [[A4]], [[B4]] +; SSE-NEXT: [[R5:%.*]] = ashr i8 [[A5]], [[B5]] +; SSE-NEXT: [[R6:%.*]] = ashr i8 [[A6]], [[B6]] +; SSE-NEXT: [[R7:%.*]] = ashr i8 [[A7]], [[B7]] +; SSE-NEXT: [[R8:%.*]] = ashr i8 [[A8]], [[B8]] +; SSE-NEXT: [[R9:%.*]] = ashr i8 [[A9]], [[B9]] +; SSE-NEXT: [[R10:%.*]] = ashr i8 [[A10]], [[B10]] +; SSE-NEXT: [[R11:%.*]] 
= ashr i8 [[A11]], [[B11]] +; SSE-NEXT: [[R12:%.*]] = ashr i8 [[A12]], [[B12]] +; SSE-NEXT: [[R13:%.*]] = ashr i8 [[A13]], [[B13]] +; SSE-NEXT: [[R14:%.*]] = ashr i8 [[A14]], [[B14]] +; SSE-NEXT: [[R15:%.*]] = ashr i8 [[A15]], [[B15]] +; SSE-NEXT: [[R16:%.*]] = ashr i8 [[A16]], [[B16]] +; SSE-NEXT: [[R17:%.*]] = ashr i8 [[A17]], [[B17]] +; SSE-NEXT: [[R18:%.*]] = ashr i8 [[A18]], [[B18]] +; SSE-NEXT: [[R19:%.*]] = ashr i8 [[A19]], [[B19]] +; SSE-NEXT: [[R20:%.*]] = ashr i8 [[A20]], [[B20]] +; SSE-NEXT: [[R21:%.*]] = ashr i8 [[A21]], [[B21]] +; SSE-NEXT: [[R22:%.*]] = ashr i8 [[A22]], [[B22]] +; SSE-NEXT: [[R23:%.*]] = ashr i8 [[A23]], [[B23]] +; SSE-NEXT: [[R24:%.*]] = ashr i8 [[A24]], [[B24]] +; SSE-NEXT: [[R25:%.*]] = ashr i8 [[A25]], [[B25]] +; SSE-NEXT: [[R26:%.*]] = ashr i8 [[A26]], [[B26]] +; SSE-NEXT: [[R27:%.*]] = ashr i8 [[A27]], [[B27]] +; SSE-NEXT: [[R28:%.*]] = ashr i8 [[A28]], [[B28]] +; SSE-NEXT: [[R29:%.*]] = ashr i8 [[A29]], [[B29]] +; SSE-NEXT: [[R30:%.*]] = ashr i8 [[A30]], [[B30]] +; SSE-NEXT: [[R31:%.*]] = ashr i8 [[A31]], [[B31]] +; SSE-NEXT: [[R32:%.*]] = ashr i8 [[A32]], [[B32]] +; SSE-NEXT: [[R33:%.*]] = ashr i8 [[A33]], [[B33]] +; SSE-NEXT: [[R34:%.*]] = ashr i8 [[A34]], [[B34]] +; SSE-NEXT: [[R35:%.*]] = ashr i8 [[A35]], [[B35]] +; SSE-NEXT: [[R36:%.*]] = ashr i8 [[A36]], [[B36]] +; SSE-NEXT: [[R37:%.*]] = ashr i8 [[A37]], [[B37]] +; SSE-NEXT: [[R38:%.*]] = ashr i8 [[A38]], [[B38]] +; SSE-NEXT: [[R39:%.*]] = ashr i8 [[A39]], [[B39]] +; SSE-NEXT: [[R40:%.*]] = ashr i8 [[A40]], [[B40]] +; SSE-NEXT: [[R41:%.*]] = ashr i8 [[A41]], [[B41]] +; SSE-NEXT: [[R42:%.*]] = ashr i8 [[A42]], [[B42]] +; SSE-NEXT: [[R43:%.*]] = ashr i8 [[A43]], [[B43]] +; SSE-NEXT: [[R44:%.*]] = ashr i8 [[A44]], [[B44]] +; SSE-NEXT: [[R45:%.*]] = ashr i8 [[A45]], [[B45]] +; SSE-NEXT: [[R46:%.*]] = ashr i8 [[A46]], [[B46]] +; SSE-NEXT: [[R47:%.*]] = ashr i8 [[A47]], [[B47]] +; SSE-NEXT: [[R48:%.*]] = ashr i8 [[A48]], [[B48]] +; SSE-NEXT: [[R49:%.*]] = ashr i8 [[A49]], [[B49]] +; SSE-NEXT: [[R50:%.*]] = ashr i8 [[A50]], [[B50]] +; SSE-NEXT: [[R51:%.*]] = ashr i8 [[A51]], [[B51]] +; SSE-NEXT: [[R52:%.*]] = ashr i8 [[A52]], [[B52]] +; SSE-NEXT: [[R53:%.*]] = ashr i8 [[A53]], [[B53]] +; SSE-NEXT: [[R54:%.*]] = ashr i8 [[A54]], [[B54]] +; SSE-NEXT: [[R55:%.*]] = ashr i8 [[A55]], [[B55]] +; SSE-NEXT: [[R56:%.*]] = ashr i8 [[A56]], [[B56]] +; SSE-NEXT: [[R57:%.*]] = ashr i8 [[A57]], [[B57]] +; SSE-NEXT: [[R58:%.*]] = ashr i8 [[A58]], [[B58]] +; SSE-NEXT: [[R59:%.*]] = ashr i8 [[A59]], [[B59]] +; SSE-NEXT: [[R60:%.*]] = ashr i8 [[A60]], [[B60]] +; SSE-NEXT: [[R61:%.*]] = ashr i8 [[A61]], [[B61]] +; SSE-NEXT: [[R62:%.*]] = ashr i8 [[A62]], [[B62]] +; SSE-NEXT: [[R63:%.*]] = ashr i8 [[A63]], [[B63]] +; SSE-NEXT: store i8 [[R0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0), align 1 +; SSE-NEXT: store i8 [[R1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1), align 1 +; SSE-NEXT: store i8 [[R2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2), align 1 +; SSE-NEXT: store i8 [[R3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3), align 1 +; SSE-NEXT: store i8 [[R4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4), align 1 +; SSE-NEXT: store i8 [[R5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5), align 1 +; SSE-NEXT: store i8 [[R6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6), align 1 +; SSE-NEXT: store i8 [[R7]], i8* getelementptr inbounds ([64 x 
i8], [64 x i8]* @c8, i32 0, i64 7), align 1 +; SSE-NEXT: store i8 [[R8]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8), align 1 +; SSE-NEXT: store i8 [[R9]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9), align 1 +; SSE-NEXT: store i8 [[R10]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1 +; SSE-NEXT: store i8 [[R11]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1 +; SSE-NEXT: store i8 [[R12]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1 +; SSE-NEXT: store i8 [[R13]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1 +; SSE-NEXT: store i8 [[R14]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1 +; SSE-NEXT: store i8 [[R15]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1 +; SSE-NEXT: store i8 [[R16]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1 +; SSE-NEXT: store i8 [[R17]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1 +; SSE-NEXT: store i8 [[R18]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1 +; SSE-NEXT: store i8 [[R19]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1 +; SSE-NEXT: store i8 [[R20]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1 +; SSE-NEXT: store i8 [[R21]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1 +; SSE-NEXT: store i8 [[R22]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1 +; SSE-NEXT: store i8 [[R23]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1 +; SSE-NEXT: store i8 [[R24]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1 +; SSE-NEXT: store i8 [[R25]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1 +; SSE-NEXT: store i8 [[R26]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1 +; SSE-NEXT: store i8 [[R27]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1 +; SSE-NEXT: store i8 [[R28]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1 +; SSE-NEXT: store i8 [[R29]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1 +; SSE-NEXT: store i8 [[R30]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1 +; SSE-NEXT: store i8 [[R31]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1 +; SSE-NEXT: store i8 [[R32]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1 +; SSE-NEXT: store i8 [[R33]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1 +; SSE-NEXT: store i8 [[R34]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1 +; SSE-NEXT: store i8 [[R35]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1 +; SSE-NEXT: store i8 [[R36]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1 +; SSE-NEXT: store i8 [[R37]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1 +; SSE-NEXT: store i8 [[R38]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1 +; SSE-NEXT: store i8 [[R39]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), 
align 1 +; SSE-NEXT: store i8 [[R40]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1 +; SSE-NEXT: store i8 [[R41]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1 +; SSE-NEXT: store i8 [[R42]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1 +; SSE-NEXT: store i8 [[R43]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1 +; SSE-NEXT: store i8 [[R44]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1 +; SSE-NEXT: store i8 [[R45]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1 +; SSE-NEXT: store i8 [[R46]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1 +; SSE-NEXT: store i8 [[R47]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1 +; SSE-NEXT: store i8 [[R48]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1 +; SSE-NEXT: store i8 [[R49]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1 +; SSE-NEXT: store i8 [[R50]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1 +; SSE-NEXT: store i8 [[R51]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1 +; SSE-NEXT: store i8 [[R52]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1 +; SSE-NEXT: store i8 [[R53]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1 +; SSE-NEXT: store i8 [[R54]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1 +; SSE-NEXT: store i8 [[R55]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1 +; SSE-NEXT: store i8 [[R56]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1 +; SSE-NEXT: store i8 [[R57]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1 +; SSE-NEXT: store i8 [[R58]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1 +; SSE-NEXT: store i8 [[R59]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1 +; SSE-NEXT: store i8 [[R60]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1 +; SSE-NEXT: store i8 [[R61]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1 +; SSE-NEXT: store i8 [[R62]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1 +; SSE-NEXT: store i8 [[R63]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1 +; SSE-NEXT: ret void +; +; AVX-LABEL: @ashr_v64i8( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX-NEXT: 
[[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]] +; AVX-NEXT: [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]] +; AVX-NEXT: [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]] +; AVX-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]] +; AVX-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; AVX-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @ashr_v64i8( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX512-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; AVX512-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX512-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX512-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX512-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]] +; AVX512-NEXT: [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]] +; AVX512-NEXT: [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]] +; AVX512-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]] +; AVX512-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; AVX512-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX512-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX512-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX512-NEXT: ret void +; +; XOP-LABEL: @ashr_v64i8( +; XOP-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* 
bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; XOP-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]] +; XOP-NEXT: [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]] +; XOP-NEXT: [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]] +; XOP-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]] +; XOP-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; XOP-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; XOP-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; XOP-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 +; XOP-NEXT: ret void ; %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1 %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1 Index: test/Transforms/SLPVectorizer/X86/shift-lshr.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/shift-lshr.ll +++ test/Transforms/SLPVectorizer/X86/shift-lshr.ll @@ -21,41 +21,73 @@ define void @lshr_v8i64() { ; SSE-LABEL: @lshr_v8i64( -; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = lshr <2 x 
i64> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 +; SSE-NEXT: [[R0:%.*]] = lshr i64 [[A0]], [[B0]] +; SSE-NEXT: [[R1:%.*]] = lshr i64 [[A1]], [[B1]] +; SSE-NEXT: [[R2:%.*]] = lshr i64 [[A2]], [[B2]] +; SSE-NEXT: [[R3:%.*]] = lshr i64 [[A3]], [[B3]] +; SSE-NEXT: [[R4:%.*]] = lshr i64 [[A4]], [[B4]] +; SSE-NEXT: [[R5:%.*]] = lshr i64 [[A5]], [[B5]] +; SSE-NEXT: [[R6:%.*]] = lshr i64 [[A6]], [[B6]] +; SSE-NEXT: [[R7:%.*]] = lshr i64 [[A7]], [[B7]] +; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SSE-NEXT: store i64 [[R6]], i64* 
getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @lshr_v8i64( -; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]] -; AVX1-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]] -; AVX1-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]] -; AVX1-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]] -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; AVX1-NEXT: [[B3:%.*]] = load i64, 
i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 +; AVX1-NEXT: [[R0:%.*]] = lshr i64 [[A0]], [[B0]] +; AVX1-NEXT: [[R1:%.*]] = lshr i64 [[A1]], [[B1]] +; AVX1-NEXT: [[R2:%.*]] = lshr i64 [[A2]], [[B2]] +; AVX1-NEXT: [[R3:%.*]] = lshr i64 [[A3]], [[B3]] +; AVX1-NEXT: [[R4:%.*]] = lshr i64 [[A4]], [[B4]] +; AVX1-NEXT: [[R5:%.*]] = lshr i64 [[A5]], [[B5]] +; AVX1-NEXT: [[R6:%.*]] = lshr i64 [[A6]], [[B6]] +; AVX1-NEXT: [[R7:%.*]] = lshr i64 [[A7]], [[B7]] +; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @lshr_v8i64( @@ -190,16 +222,83 @@ ; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 ; SSE-NEXT: ret void ; -; AVX-LABEL: @lshr_v16i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 -; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: ret void +; AVX1-LABEL: @lshr_v16i32( +; AVX1-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 +; AVX1-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 +; AVX1-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 +; AVX1-NEXT: [[A4:%.*]] 
= load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 +; AVX1-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 +; AVX1-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 +; AVX1-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 +; AVX1-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 +; AVX1-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 +; AVX1-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 +; AVX1-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 +; AVX1-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 +; AVX1-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 +; AVX1-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 +; AVX1-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 +; AVX1-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 +; AVX1-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 +; AVX1-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 +; AVX1-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 +; AVX1-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 +; AVX1-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 +; AVX1-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 +; AVX1-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 +; AVX1-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 +; AVX1-NEXT: [[R0:%.*]] = lshr i32 [[A0]], [[B0]] +; AVX1-NEXT: [[R1:%.*]] = lshr i32 [[A1]], [[B1]] +; AVX1-NEXT: [[R2:%.*]] = lshr i32 [[A2]], [[B2]] +; AVX1-NEXT: [[R3:%.*]] = lshr i32 [[A3]], [[B3]] +; AVX1-NEXT: 
[[R4:%.*]] = lshr i32 [[A4]], [[B4]] +; AVX1-NEXT: [[R5:%.*]] = lshr i32 [[A5]], [[B5]] +; AVX1-NEXT: [[R6:%.*]] = lshr i32 [[A6]], [[B6]] +; AVX1-NEXT: [[R7:%.*]] = lshr i32 [[A7]], [[B7]] +; AVX1-NEXT: [[R8:%.*]] = lshr i32 [[A8]], [[B8]] +; AVX1-NEXT: [[R9:%.*]] = lshr i32 [[A9]], [[B9]] +; AVX1-NEXT: [[R10:%.*]] = lshr i32 [[A10]], [[B10]] +; AVX1-NEXT: [[R11:%.*]] = lshr i32 [[A11]], [[B11]] +; AVX1-NEXT: [[R12:%.*]] = lshr i32 [[A12]], [[B12]] +; AVX1-NEXT: [[R13:%.*]] = lshr i32 [[A13]], [[B13]] +; AVX1-NEXT: [[R14:%.*]] = lshr i32 [[A14]], [[B14]] +; AVX1-NEXT: [[R15:%.*]] = lshr i32 [[A15]], [[B15]] +; AVX1-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 +; AVX1-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 +; AVX1-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 +; AVX1-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 +; AVX1-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 +; AVX1-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 +; AVX1-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 +; AVX1-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 +; AVX1-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 +; AVX1-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 +; AVX1-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 +; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 +; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 +; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 +; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 +; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @lshr_v16i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX2-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX2-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]] +; AVX2-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]] +; AVX2-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX2-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX2-NEXT: ret void ; ; AVX512-LABEL: @lshr_v16i32( ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 
x i32]* @a32 to <16 x i32>*), align 4 Index: test/Transforms/SLPVectorizer/X86/shift-shl.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/shift-shl.ll +++ test/Transforms/SLPVectorizer/X86/shift-shl.ll @@ -21,41 +21,73 @@ define void @shl_v8i64() { ; SSE-LABEL: @shl_v8i64( -; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* 
@b64, i32 0, i64 2), align 8 +; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 +; SSE-NEXT: [[R0:%.*]] = shl i64 [[A0]], [[B0]] +; SSE-NEXT: [[R1:%.*]] = shl i64 [[A1]], [[B1]] +; SSE-NEXT: [[R2:%.*]] = shl i64 [[A2]], [[B2]] +; SSE-NEXT: [[R3:%.*]] = shl i64 [[A3]], [[B3]] +; SSE-NEXT: [[R4:%.*]] = shl i64 [[A4]], [[B4]] +; SSE-NEXT: [[R5:%.*]] = shl i64 [[A5]], [[B5]] +; SSE-NEXT: [[R6:%.*]] = shl i64 [[A6]], [[B6]] +; SSE-NEXT: [[R7:%.*]] = shl i64 [[A7]], [[B7]] +; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SSE-NEXT: ret void ; ; AVX1-LABEL: @shl_v8i64( -; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]] -; AVX1-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]] -; AVX1-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]] -; AVX1-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]] -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x 
i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 +; AVX1-NEXT: [[R0:%.*]] = shl i64 [[A0]], [[B0]] +; AVX1-NEXT: [[R1:%.*]] = shl i64 [[A1]], [[B1]] +; AVX1-NEXT: [[R2:%.*]] = shl i64 [[A2]], [[B2]] +; AVX1-NEXT: [[R3:%.*]] = shl i64 [[A3]], [[B3]] +; AVX1-NEXT: [[R4:%.*]] = shl i64 [[A4]], [[B4]] +; AVX1-NEXT: [[R5:%.*]] = shl i64 [[A5]], [[B5]] +; AVX1-NEXT: [[R6:%.*]] = shl i64 [[A6]], [[B6]] +; AVX1-NEXT: [[R7:%.*]] = shl i64 [[A7]], [[B7]] +; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; AVX1-NEXT: 
ret void ; ; AVX2-LABEL: @shl_v8i64( @@ -124,22 +156,70 @@ define void @shl_v16i32() { ; SSE-LABEL: @shl_v16i32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = shl <4 x i32> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = shl <4 x i32> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 +; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 +; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 +; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 +; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 +; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 +; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 +; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 +; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 +; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 +; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 +; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 +; SSE-NEXT: [[A12:%.*]] 
= load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 +; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 +; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 +; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 +; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 +; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 +; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 +; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 +; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 +; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 +; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 +; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 +; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 +; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 +; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 +; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 +; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 +; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 +; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 +; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 +; SSE-NEXT: [[R0:%.*]] = shl i32 [[A0]], [[B0]] +; SSE-NEXT: [[R1:%.*]] = shl i32 [[A1]], [[B1]] +; SSE-NEXT: [[R2:%.*]] = shl i32 [[A2]], [[B2]] +; SSE-NEXT: [[R3:%.*]] = shl i32 [[A3]], [[B3]] +; SSE-NEXT: [[R4:%.*]] = shl i32 [[A4]], [[B4]] +; SSE-NEXT: [[R5:%.*]] = shl i32 [[A5]], [[B5]] +; SSE-NEXT: [[R6:%.*]] = shl i32 [[A6]], [[B6]] +; SSE-NEXT: [[R7:%.*]] = shl i32 [[A7]], [[B7]] +; SSE-NEXT: [[R8:%.*]] = shl i32 [[A8]], [[B8]] +; SSE-NEXT: [[R9:%.*]] = shl i32 [[A9]], [[B9]] +; SSE-NEXT: [[R10:%.*]] = shl i32 [[A10]], [[B10]] +; SSE-NEXT: [[R11:%.*]] = shl i32 [[A11]], [[B11]] +; SSE-NEXT: [[R12:%.*]] = shl i32 [[A12]], [[B12]] +; SSE-NEXT: [[R13:%.*]] = shl i32 [[A13]], [[B13]] +; SSE-NEXT: [[R14:%.*]] = shl i32 [[A14]], [[B14]] +; SSE-NEXT: [[R15:%.*]] = shl i32 [[A15]], [[B15]] +; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 +; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 +; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 +; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds 
([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 +; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 +; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 +; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 +; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 +; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 +; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 +; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 +; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 +; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 +; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 +; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 +; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @shl_v16i32(