Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -691,6 +691,10 @@
   /// \return The width of the smallest vector register type.
   unsigned getMinVectorRegisterBitWidth() const;
 
+  /// \return True if the target has semi vector register instructions,
+  /// e.g. packed horizontal add or subtract.
+  bool hasSemiVectorRegisterInstrs() const;
+
   /// \return True if the vectorization factor should be chosen to
   /// make the vector of the smallest element type match the size of a
   /// vector register. For wider element types, this could result in
@@ -1108,6 +1112,7 @@
   virtual unsigned getNumberOfRegisters(bool Vector) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
+  virtual bool hasSemiVectorRegisterInstrs() const = 0;
   virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
   virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
   virtual bool shouldConsiderAddressTypePromotion(
@@ -1411,6 +1416,9 @@
   unsigned getMinVectorRegisterBitWidth() override {
     return Impl.getMinVectorRegisterBitWidth();
   }
+  bool hasSemiVectorRegisterInstrs() const override {
+    return Impl.hasSemiVectorRegisterInstrs();
+  }
   bool shouldMaximizeVectorBandwidth(bool OptSize) const override {
     return Impl.shouldMaximizeVectorBandwidth(OptSize);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -357,6 +357,8 @@
   unsigned getMinVectorRegisterBitWidth() { return 128; }
 
+  bool hasSemiVectorRegisterInstrs() const { return false; }
+
   bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; }
 
   unsigned getMinimumVF(unsigned ElemWidth) const { return 0; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -502,6 +502,8 @@
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }
 
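+  // Conservative default; targets with packed horizontal (semi vector
+  // register) operations opt in by overriding this, as X86TTIImpl does.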
+  bool hasSemiVectorRegisterInstrs() const { return false; }
+
   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the result needs to be inserted and/or extracted from vectors.
   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -344,6 +344,10 @@
   return TTIImpl->getMinVectorRegisterBitWidth();
 }
 
+bool TargetTransformInfo::hasSemiVectorRegisterInstrs() const {
+  return TTIImpl->hasSemiVectorRegisterInstrs();
+}
+
 bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const {
   return TTIImpl->shouldMaximizeVectorBandwidth(OptSize);
 }
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -60,6 +60,7 @@
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector) const;
+  bool hasSemiVectorRegisterInstrs() const;
   unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
   unsigned getMaxInterleaveFactor(unsigned VF);
   int getArithmeticInstrCost(
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -147,6 +147,11 @@
   return 32;
 }
 
+// The X86 instruction set has packed horizontal adds and subtracts.
+bool X86TTIImpl::hasSemiVectorRegisterInstrs() const {
+  return true;
+}
+
 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
   return getRegisterBitWidth(true);
 }
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -136,6 +136,10 @@
     "slp-min-reg-size", cl::init(128), cl::Hidden,
     cl::desc("Attempt to vectorize for this register size in bits"));
 
+static cl::opt<bool> UseSemiVecRegOption(
+    "slp-use-semi-reg", cl::init(false), cl::Hidden,
+    cl::desc("Attempt to vectorize for the semi vector register size"));
+
 static cl::opt<int> RecursionMaxDepth(
     "slp-recursion-max-depth", cl::init(12), cl::Hidden,
     cl::desc("Limit the recursion depth when building a vectorizable tree"));
@@ -504,6 +508,11 @@
     MinVecRegSize = MinVectorRegSizeOption;
   else
     MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
+
+  if (UseSemiVecRegOption.getNumOccurrences())
+    UseSemiVecReg = UseSemiVecRegOption;
+  else
+    UseSemiVecReg = TTI->hasSemiVectorRegisterInstrs();
 }
 
 /// Vectorize the tree that starts with the elements in \p VL.
@@ -592,6 +601,11 @@
     return MinVecRegSize;
   }
 
+  /// \returns True if the target has partial (semi) vector operations, as
+  /// set by TTI or overridden by cl::opt.
+  bool useSemiVecReg() const {
+    return UseSemiVecReg;
+  }
+
   /// Check if ArrayType or StructType is isomorphic to some VectorType.
   ///
   /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
@@ -1226,6 +1240,7 @@
   unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
   unsigned MinVecRegSize; // Set by cl::opt (default: 128).
+  bool UseSemiVecReg;     // This is set by TTI or overridden by cl::opt.
 
   /// Instruction builder to construct the vectorized tree.
   IRBuilder<> Builder;
@@ -4796,10 +4811,13 @@
       I = ConsecutiveChain[I];
     }
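+    // Targets with semi vector register instructions (e.g. packed horizontal
+    // add/subtract on X86) can profit from store chains of half the minimum
+    // vector register width, so lower the floor of the size search.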
+    unsigned minSize = R.getMinVecRegSize();
+    if (R.useSemiVecReg())
+      minSize /= 2;
+
     // FIXME: Is division-by-2 the correct step? Should we assert that the
     // register size is a power-of-2?
-    for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
-         Size /= 2) {
+    for (unsigned Size = R.getMaxVecRegSize(); Size >= minSize; Size /= 2) {
       if (vectorizeStoreChain(Operands, R, Size)) {
         // Mark the vectorized stores so that we don't vectorize them again.
         VectorizedStores.insert(Operands.begin(), Operands.end());
Index: test/Transforms/SLPVectorizer/X86/addsub.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/addsub.ll
+++ test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -348,22 +348,18 @@
 define void @no_vec_shuff_reorder() #0 {
 ; CHECK-LABEL: @no_vec_shuff_reorder(
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]]
-; CHECK-NEXT: store float [[TMP3]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
-; CHECK-NEXT: [[TMP6:%.*]] = fsub float [[TMP4]], [[TMP5]]
-; CHECK-NEXT: store float [[TMP6]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
-; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
-; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
-; CHECK-NEXT: [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]]
-; CHECK-NEXT: store float [[TMP9]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
-; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
-; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
-; CHECK-NEXT: [[TMP12:%.*]] = fsub float [[TMP10]], [[TMP11]]
-; CHECK-NEXT: store float [[TMP12]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fa to <2 x float>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fb to <2 x float>*), align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: store <2 x float> [[TMP5]], <2 x float>* bitcast ([4 x float]* @fc to <2 x float>*), align 4
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2) to <2 x float>*), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2) to <2 x float>*), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = fsub <2 x float> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: store <2 x float> [[TMP10]], <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2) to <2 x float>*), align 4
 ; CHECK-NEXT: ret void
 ;
 %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
Index: test/Transforms/SLPVectorizer/X86/fptosi.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/fptosi.ll
+++ test/Transforms/SLPVectorizer/X86/fptosi.ll
@@ -221,32 +221,38 @@
 }
 
 define void @fptosi_8f64_8i8() #0 {
-; CHECK-LABEL: @fptosi_8f64_8i8(
-; CHECK-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; CHECK-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; CHECK-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; CHECK-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; CHECK-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; CHECK-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; CHECK-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; CHECK-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i8
-; CHECK-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i8
-; CHECK-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i8
-; CHECK-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i8
-; CHECK-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i8
-; CHECK-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i8
-; CHECK-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i8
-; CHECK-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i8
-; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
-; CHECK-NEXT: ret void
+; SSE-LABEL: @fptosi_8f64_8i8(
+; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i8
+; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i8
+; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i8
+; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i8
+; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i8
+; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i8
+; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i8
+; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i8
+; SSE-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; SSE-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; SSE-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; SSE-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; SSE-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; SSE-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; SSE-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; SSE-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @fptosi_8f64_8i8(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i8>
+; AVX-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1
+; AVX-NEXT: ret void
 ;
 %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
 %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -455,30 +455,9 @@
 define void @fptosi_8f32_8i8() #0 {
 ; CHECK-LABEL: @fptosi_8f32_8i8(
-; CHECK-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; CHECK-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; CHECK-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; CHECK-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; CHECK-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; CHECK-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; CHECK-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; CHECK-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; CHECK-NEXT: [[CVT0:%.*]] = fptosi float [[A0]] to i8
-; CHECK-NEXT: [[CVT1:%.*]] = fptosi float [[A1]] to i8
-; CHECK-NEXT: [[CVT2:%.*]] = fptosi float [[A2]] to i8
-; CHECK-NEXT: [[CVT3:%.*]] = fptosi float [[A3]] to i8
-; CHECK-NEXT: [[CVT4:%.*]] = fptosi float [[A4]] to i8
-; CHECK-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i8
-; CHECK-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i8
-; CHECK-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i8
-; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i8>
+; CHECK-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1
 ; CHECK-NEXT: ret void
 ;
 %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
Index: test/Transforms/SLPVectorizer/X86/fptoui.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/fptoui.ll
+++ test/Transforms/SLPVectorizer/X86/fptoui.ll
@@ -305,32 +305,71 @@
 }
 
 define void @fptoui_8f64_8i8() #0 {
-; CHECK-LABEL: @fptoui_8f64_8i8(
-; CHECK-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; CHECK-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; CHECK-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; CHECK-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; CHECK-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; CHECK-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; CHECK-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; CHECK-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i8
-; CHECK-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i8
-; CHECK-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i8
-; CHECK-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i8
-; CHECK-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i8
-; CHECK-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i8
-; CHECK-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i8
-; CHECK-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i8
-; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
-; CHECK-NEXT: ret void
+; SSE-LABEL: @fptoui_8f64_8i8(
+; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i8
+; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i8
+; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i8
+; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i8
+; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i8
+; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i8
+; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i8
+; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i8
+; SSE-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; SSE-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; SSE-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; SSE-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; SSE-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; SSE-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; SSE-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; SSE-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @fptoui_8f64_8i8(
+; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i8
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i8
+; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i8
+; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i8
+; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i8
+; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i8
+; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i8
+; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i8
+; AVX256NODQ-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; AVX256NODQ-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; AVX256NODQ-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; AVX256NODQ-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; AVX256NODQ-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; AVX256NODQ-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; AVX256NODQ-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; AVX256NODQ-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @fptoui_8f64_8i8(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i8>
+; AVX512-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @fptoui_8f64_8i8(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i8>
+; AVX256DQ-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1
+; AVX256DQ-NEXT: ret void
 ;
 %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
 %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -616,32 +655,38 @@
 }
 
 define void @fptoui_8f32_8i8() #0 {
-; CHECK-LABEL: @fptoui_8f32_8i8(
-; CHECK-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; CHECK-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; CHECK-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; CHECK-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; CHECK-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; CHECK-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; CHECK-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; CHECK-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; CHECK-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i8
-; CHECK-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i8
-; CHECK-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i8
-; CHECK-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i8
-; CHECK-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i8
-; CHECK-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i8
-; CHECK-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i8
-; CHECK-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i8
-; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
-; CHECK-NEXT: ret void
+; SSE-LABEL: @fptoui_8f32_8i8(
+; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i8
+; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i8
+; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i8
+; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i8
+; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i8
+; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i8
+; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i8
+; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i8
+; SSE-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; SSE-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; SSE-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; SSE-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; SSE-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; SSE-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; SSE-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; SSE-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @fptoui_8f32_8i8(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i8>
+; AVX-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1
+; AVX-NEXT: ret void
 ;
 %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
 %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
Index: test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
+++ test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -slp-use-semi-reg=0 -pass-remarks-output=%t < %s | FileCheck %s
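+; Pin -slp-use-semi-reg=0: the remark checked below relies on the function
+; being only partially vectorized.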
 ; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
 
 define i32 @foo(i32* nocapture readonly %diff) #0 {
Index: test/Transforms/SLPVectorizer/X86/reorder_phi.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/reorder_phi.ll
+++ test/Transforms/SLPVectorizer/X86/reorder_phi.ll
@@ -9,33 +9,38 @@
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 256, 0
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP20:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP18:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4
-; CHECK-NEXT: [[TMP12:%.*]] = fmul float [[TMP5]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = fmul float [[TMP7]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = fsub float [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = fmul float [[TMP7]], [[TMP9]]
-; CHECK-NEXT: [[TMP16:%.*]] = fmul float [[TMP5]], [[TMP11]]
-; CHECK-NEXT: [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP18]] = fadd float [[TMP3]], [[TMP14]]
-; CHECK-NEXT: [[TMP19]] = fadd float [[TMP2]], [[TMP17]]
-; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[TMP0]]
-; CHECK-NEXT: br i1 [[TMP21]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP25:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP24:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[TMP5]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[TMP9]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x float> undef, float [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP6]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> undef, float [[TMP14]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP16]], i32 1
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x float> undef, float [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP10]], i32 1
+; CHECK-NEXT: [[TMP20:%.*]] = fmul <2 x float> [[TMP17]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = fsub <2 x float> [[TMP13]], [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x float> [[TMP13]], [[TMP20]]
+; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x float> [[TMP21]], <2 x float> [[TMP22]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP24]] = fadd <2 x float> [[TMP2]], [[TMP23]]
+; CHECK-NEXT: [[TMP25]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], [[TMP0]]
+; CHECK-NEXT: br i1 [[TMP26]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK: exit:
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0
-; CHECK-NEXT: store float [[TMP18]], float* [[TMP22]], align 4
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1
-; CHECK-NEXT: store float [[TMP19]], float* [[TMP23]], align 4
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP27]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP24]], <2 x float>* [[TMP29]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
Index: test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
+++ test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
@@ -14,14 +14,10 @@
 ; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], <i32 31, i32 31, i32 31, i32 31>
 ; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> <i32 1, i32 1, i32 1, i32 1>, [[TMP1]]
 ; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4
-; CHECK-NEXT: [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31
-; CHECK-NEXT: [[DOTLOBIT_NOT_4:%.*]] = xor i32 [[DOTLOBIT_4]], 1
-; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_4]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4
-; CHECK-NEXT: [[DOTLOBIT_5:%.*]] = lshr i32 [[TMP4]], 31
-; CHECK-NEXT: [[DOTLOBIT_NOT_5:%.*]] = xor i32 [[DOTLOBIT_5]], 1
-; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_5]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0) to <2 x i32>*), align 4
+; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i32> [[TMP3]], <i32 31, i32 31>
+; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> <i32 1, i32 1>, [[TMP4]]
+; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* bitcast (i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0) to <2 x i32>*), align 4
 ; CHECK-NEXT: ret i32 undef
 ;
 entry:
Index: test/Transforms/SLPVectorizer/X86/semi-vec-reg-128bit.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/semi-vec-reg-128bit.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=SEMI128
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-use-semi-reg=0 -S | FileCheck %s --check-prefix=NOSEMI128
+
+; Check that SLPVectorizer works for packed horizontal 128-bit instrs.
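+; With the default setting (X86 reports semi vector register instructions)
+; the two scalar fadds below are combined into a single <2 x float> fadd;
+; with -slp-use-semi-reg=0 they stay scalar.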
+; See llvm.org/PR32433
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @add_pairs_128(<4 x float>, float* nocapture) #0 {
+; SEMI128-LABEL: @add_pairs_128(
+; SEMI128-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0:%.*]], i32 0
+; SEMI128-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; SEMI128-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; SEMI128-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; SEMI128-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0
+; SEMI128-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[TMP5]], i32 1
+; SEMI128-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[TMP4]], i32 0
+; SEMI128-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP6]], i32 1
+; SEMI128-NEXT: [[TMP11:%.*]] = fadd <2 x float> [[TMP8]], [[TMP10]]
+; SEMI128-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 1
+; SEMI128-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP1]] to <2 x float>*
+; SEMI128-NEXT: store <2 x float> [[TMP11]], <2 x float>* [[TMP13]], align 4
+; SEMI128-NEXT: ret void
+;
+; NOSEMI128-LABEL: @add_pairs_128(
+; NOSEMI128-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0:%.*]], i32 0
+; NOSEMI128-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; NOSEMI128-NEXT: [[TMP5:%.*]] = fadd float [[TMP3]], [[TMP4]]
+; NOSEMI128-NEXT: store float [[TMP5]], float* [[TMP1:%.*]], align 4
+; NOSEMI128-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; NOSEMI128-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; NOSEMI128-NEXT: [[TMP8:%.*]] = fadd float [[TMP6]], [[TMP7]]
+; NOSEMI128-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 1
+; NOSEMI128-NEXT: store float [[TMP8]], float* [[TMP9]], align 4
+; NOSEMI128-NEXT: ret void
+;
+  %3 = extractelement <4 x float> %0, i32 0
+  %4 = extractelement <4 x float> %0, i32 1
+  %5 = fadd float %3, %4
+  store float %5, float* %1, align 4
+  %6 = extractelement <4 x float> %0, i32 2
+  %7 = extractelement <4 x float> %0, i32 3
+  %8 = fadd float %6, %7
+  %9 = getelementptr inbounds float, float* %1, i64 1
+  store float %8, float* %9, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
Index: test/Transforms/SLPVectorizer/X86/shift-lshr.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/shift-lshr.ll
+++ test/Transforms/SLPVectorizer/X86/shift-lshr.ll
@@ -125,70 +125,38 @@
 define void @lshr_v16i32() {
 ; SSE-LABEL: @lshr_v16i32(
-; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
-; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
-; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
-; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
-; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
-; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
-; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
-; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
-; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
-; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
-; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
-; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
-; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
-; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
-; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
-; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
-; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
-; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
-; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
-; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
-; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
-; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
-; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
-; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
-; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
-; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
-; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
-; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
-; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
-; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
-; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
-; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
-; SSE-NEXT: [[R0:%.*]] = lshr i32 [[A0]], [[B0]]
-; SSE-NEXT: [[R1:%.*]] = lshr i32 [[A1]], [[B1]]
-; SSE-NEXT: [[R2:%.*]] = lshr i32 [[A2]], [[B2]]
-; SSE-NEXT: [[R3:%.*]] = lshr i32 [[A3]], [[B3]]
-; SSE-NEXT: [[R4:%.*]] = lshr i32 [[A4]], [[B4]]
-; SSE-NEXT: [[R5:%.*]] = lshr i32 [[A5]], [[B5]]
-; SSE-NEXT: [[R6:%.*]] = lshr i32 [[A6]], [[B6]]
-; SSE-NEXT: [[R7:%.*]] = lshr i32 [[A7]], [[B7]]
-; SSE-NEXT: [[R8:%.*]] = lshr i32 [[A8]], [[B8]]
-; SSE-NEXT: [[R9:%.*]] = lshr i32 [[A9]], [[B9]]
-; SSE-NEXT: [[R10:%.*]] = lshr i32 [[A10]], [[B10]]
-; SSE-NEXT: [[R11:%.*]] = lshr i32 [[A11]], [[B11]]
-; SSE-NEXT: [[R12:%.*]] = lshr i32 [[A12]], [[B12]]
-; SSE-NEXT: [[R13:%.*]] = lshr i32 [[A13]], [[B13]]
-; SSE-NEXT: [[R14:%.*]] = lshr i32 [[A14]], [[B14]]
-; SSE-NEXT: [[R15:%.*]] = lshr i32 [[A15]], [[B15]]
-; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
-; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
-; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
-; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
-; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
-; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
-; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
-; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
-; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
-; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
-; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
-; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
-; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
-; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
-; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
-; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @a32 to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2) to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6) to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10) to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14) to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @b32 to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2) to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP12:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6) to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP13:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP14:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10) to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP15:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP16:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14) to <2 x i32>*), align 4
+; SSE-NEXT: [[TMP17:%.*]] = lshr <2 x i32> [[TMP1]], [[TMP9]]
+; SSE-NEXT: [[TMP18:%.*]] = lshr <2 x i32> [[TMP2]], [[TMP10]]
+; SSE-NEXT: [[TMP19:%.*]] = lshr <2 x i32> [[TMP3]], [[TMP11]]
+; SSE-NEXT: [[TMP20:%.*]] = lshr <2 x i32> [[TMP4]], [[TMP12]]
+; SSE-NEXT: [[TMP21:%.*]] = lshr <2 x i32> [[TMP5]], [[TMP13]]
+; SSE-NEXT: [[TMP22:%.*]] = lshr <2 x i32> [[TMP6]], [[TMP14]]
+; SSE-NEXT: [[TMP23:%.*]] = lshr <2 x i32> [[TMP7]], [[TMP15]]
+; SSE-NEXT: [[TMP24:%.*]] = lshr <2 x i32> [[TMP8]], [[TMP16]]
+; SSE-NEXT: store <2 x i32> [[TMP17]], <2 x i32>* bitcast ([16 x i32]* @c32 to <2 x i32>*), align 4
+; SSE-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2) to <2 x i32>*), align 4
+; SSE-NEXT: store <2 x i32> [[TMP19]], <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <2 x i32>*), align 4
+; SSE-NEXT: store <2 x i32> [[TMP20]], <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6) to <2 x i32>*), align 4
+; SSE-NEXT: store <2 x i32> [[TMP21]], <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <2 x i32>*), align 4
+; SSE-NEXT: store <2 x i32> [[TMP22]], <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10) to <2 x i32>*), align 4
+; SSE-NEXT: store <2 x i32> [[TMP23]], <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <2 x i32>*), align 4
+; SSE-NEXT: store <2 x i32> [[TMP24]], <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14) to <2 x i32>*), align 4
 ; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @lshr_v16i32(
Index: test/Transforms/SLPVectorizer/X86/shift-shl.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/shift-shl.ll
+++ test/Transforms/SLPVectorizer/X86/shift-shl.ll
@@ -241,134 +241,38 @@
 define void @shl_v32i16() {
 ; SSE-LABEL: @shl_v32i16(
-; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
-; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
-; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
-; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
-; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
-; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
-; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
-; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
-; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
-; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
-; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
-; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
-; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
-; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
-; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
-; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
-; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
-; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
-; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
-; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
-; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
-; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
-; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
-; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
-; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
-; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
-; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
-; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
-; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
-; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
-; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
-; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
-; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
-; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
-; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 -; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 -; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 -; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 -; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 -; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 -; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 -; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 -; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 -; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 -; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 -; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 -; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 -; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 -; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 -; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 -; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 -; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 -; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 -; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 -; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 -; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 -; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 -; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 -; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 -; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 -; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 -; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 -; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 -; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 -; SSE-NEXT: [[R0:%.*]] = shl i16 
[[A0]], [[B0]] -; SSE-NEXT: [[R1:%.*]] = shl i16 [[A1]], [[B1]] -; SSE-NEXT: [[R2:%.*]] = shl i16 [[A2]], [[B2]] -; SSE-NEXT: [[R3:%.*]] = shl i16 [[A3]], [[B3]] -; SSE-NEXT: [[R4:%.*]] = shl i16 [[A4]], [[B4]] -; SSE-NEXT: [[R5:%.*]] = shl i16 [[A5]], [[B5]] -; SSE-NEXT: [[R6:%.*]] = shl i16 [[A6]], [[B6]] -; SSE-NEXT: [[R7:%.*]] = shl i16 [[A7]], [[B7]] -; SSE-NEXT: [[R8:%.*]] = shl i16 [[A8]], [[B8]] -; SSE-NEXT: [[R9:%.*]] = shl i16 [[A9]], [[B9]] -; SSE-NEXT: [[R10:%.*]] = shl i16 [[A10]], [[B10]] -; SSE-NEXT: [[R11:%.*]] = shl i16 [[A11]], [[B11]] -; SSE-NEXT: [[R12:%.*]] = shl i16 [[A12]], [[B12]] -; SSE-NEXT: [[R13:%.*]] = shl i16 [[A13]], [[B13]] -; SSE-NEXT: [[R14:%.*]] = shl i16 [[A14]], [[B14]] -; SSE-NEXT: [[R15:%.*]] = shl i16 [[A15]], [[B15]] -; SSE-NEXT: [[R16:%.*]] = shl i16 [[A16]], [[B16]] -; SSE-NEXT: [[R17:%.*]] = shl i16 [[A17]], [[B17]] -; SSE-NEXT: [[R18:%.*]] = shl i16 [[A18]], [[B18]] -; SSE-NEXT: [[R19:%.*]] = shl i16 [[A19]], [[B19]] -; SSE-NEXT: [[R20:%.*]] = shl i16 [[A20]], [[B20]] -; SSE-NEXT: [[R21:%.*]] = shl i16 [[A21]], [[B21]] -; SSE-NEXT: [[R22:%.*]] = shl i16 [[A22]], [[B22]] -; SSE-NEXT: [[R23:%.*]] = shl i16 [[A23]], [[B23]] -; SSE-NEXT: [[R24:%.*]] = shl i16 [[A24]], [[B24]] -; SSE-NEXT: [[R25:%.*]] = shl i16 [[A25]], [[B25]] -; SSE-NEXT: [[R26:%.*]] = shl i16 [[A26]], [[B26]] -; SSE-NEXT: [[R27:%.*]] = shl i16 [[A27]], [[B27]] -; SSE-NEXT: [[R28:%.*]] = shl i16 [[A28]], [[B28]] -; SSE-NEXT: [[R29:%.*]] = shl i16 [[A29]], [[B29]] -; SSE-NEXT: [[R30:%.*]] = shl i16 [[A30]], [[B30]] -; SSE-NEXT: [[R31:%.*]] = shl i16 [[A31]], [[B31]] -; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 -; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 -; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 -; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 -; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 -; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 -; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 -; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 -; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 -; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 -; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 -; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 -; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 -; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 -; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 -; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 -; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 -; SSE-NEXT: store 
i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 -; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 -; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 -; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 -; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 -; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 -; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 -; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 -; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 -; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 -; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 -; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 -; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 -; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 -; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @a16 to <4 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4) to <4 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <4 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12) to <4 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <4 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20) to <4 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <4 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28) to <4 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @b16 to <4 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4) to <4 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <4 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12) to <4 x i16>*), align 2 +; SSE-NEXT: 
+; SSE-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <4 x i16>*), align 2
+; SSE-NEXT: [[TMP14:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20) to <4 x i16>*), align 2
+; SSE-NEXT: [[TMP15:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <4 x i16>*), align 2
+; SSE-NEXT: [[TMP16:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28) to <4 x i16>*), align 2
+; SSE-NEXT: [[TMP17:%.*]] = shl <4 x i16> [[TMP1]], [[TMP9]]
+; SSE-NEXT: [[TMP18:%.*]] = shl <4 x i16> [[TMP2]], [[TMP10]]
+; SSE-NEXT: [[TMP19:%.*]] = shl <4 x i16> [[TMP3]], [[TMP11]]
+; SSE-NEXT: [[TMP20:%.*]] = shl <4 x i16> [[TMP4]], [[TMP12]]
+; SSE-NEXT: [[TMP21:%.*]] = shl <4 x i16> [[TMP5]], [[TMP13]]
+; SSE-NEXT: [[TMP22:%.*]] = shl <4 x i16> [[TMP6]], [[TMP14]]
+; SSE-NEXT: [[TMP23:%.*]] = shl <4 x i16> [[TMP7]], [[TMP15]]
+; SSE-NEXT: [[TMP24:%.*]] = shl <4 x i16> [[TMP8]], [[TMP16]]
+; SSE-NEXT: store <4 x i16> [[TMP17]], <4 x i16>* bitcast ([32 x i16]* @c16 to <4 x i16>*), align 2
+; SSE-NEXT: store <4 x i16> [[TMP18]], <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4) to <4 x i16>*), align 2
+; SSE-NEXT: store <4 x i16> [[TMP19]], <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <4 x i16>*), align 2
+; SSE-NEXT: store <4 x i16> [[TMP20]], <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12) to <4 x i16>*), align 2
+; SSE-NEXT: store <4 x i16> [[TMP21]], <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <4 x i16>*), align 2
+; SSE-NEXT: store <4 x i16> [[TMP22]], <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20) to <4 x i16>*), align 2
+; SSE-NEXT: store <4 x i16> [[TMP23]], <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <4 x i16>*), align 2
+; SSE-NEXT: store <4 x i16> [[TMP24]], <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28) to <4 x i16>*), align 2
 ; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @shl_v32i16(
Index: test/Transforms/SLPVectorizer/X86/sitofp.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/sitofp.ll
+++ test/Transforms/SLPVectorizer/X86/sitofp.ll
@@ -598,14 +598,35 @@
 ;
 define void @sitofp_2i64_2f32() #0 {
-; CHECK-LABEL: @sitofp_2i64_2f32(
-; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
-; CHECK-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
-; CHECK-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; CHECK-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; CHECK-NEXT: ret void
+; SSE-LABEL: @sitofp_2i64_2f32(
+; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @sitofp_2i64_2f32(
+; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @sitofp_2i64_2f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float>
+; AVX512-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @sitofp_2i64_2f32(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float>
+; AVX256DQ-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
+; AVX256DQ-NEXT: ret void
 ;
 %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
Index: test/Transforms/SLPVectorizer/X86/tiny-tree.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/tiny-tree.ll
+++ test/Transforms/SLPVectorizer/X86/tiny-tree.ll
@@ -172,13 +172,13 @@
 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1
 ; CHECK-NEXT: store float [[TMP1]], float* [[ARRAYIDX3]], align 4
 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2
-; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX5]], align 4
 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3
-; CHECK-NEXT: store float [[TMP3]], float* [[ARRAYIDX7]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX5]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]]
 ; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 [[I_023]]
 ; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1
Index: test/Transforms/SLPVectorizer/X86/uitofp.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/uitofp.ll
+++ test/Transforms/SLPVectorizer/X86/uitofp.ll
@@ -550,14 +550,35 @@
 ;
 define void @uitofp_2i64_2f32() #0 {
-; CHECK-LABEL: @uitofp_2i64_2f32(
-; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
-; CHECK-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
-; CHECK-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; CHECK-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; CHECK-NEXT: ret void
+; SSE-LABEL: @uitofp_2i64_2f32(
+; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @uitofp_2i64_2f32(
+; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @uitofp_2i64_2f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float>
+; AVX512-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @uitofp_2i64_2f32(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float>
+; AVX256DQ-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
+; AVX256DQ-NEXT: ret void
 ;
 %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
Index: test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -47,17 +47,16 @@
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
 ; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[INCDEC_PTR]] to <2 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> <i32 1, i32 2>, [[TMP2]]
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INCDEC_PTR1]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP5]], 3
 ; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -95,13 +94,12 @@
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
 ; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3
-; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[INCDEC_PTR2]] to <2 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> <i32 -2, i32 -3>, [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[INCDEC_PTR3]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -214,13 +212,14 @@
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
 ; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
-; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[INCDEC_PTR2]] to <2 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3>
+; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[INCDEC_PTR3]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32>* [[TMP7]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -248,21 +247,22 @@
 ; CHECK-LABEL: @addsub1(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <2 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 -1, i32 -1>
+; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> [[TMP1]], <i32 -1, i32 -1>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
+; CHECK-NEXT: store i32 [[TMP6]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP7]], -3
 ; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -338,17 +338,16 @@
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
 ; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[INCDEC_PTR]] to <2 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = shl <2 x i32> [[TMP2]], <i32 1, i32 2>
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INCDEC_PTR1]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP5]], 3
 ; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -457,17 +456,16 @@
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
 ; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[INCDEC_PTR]] to <2 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, [[TMP2]]
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[INCDEC_PTR1]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP5]], 3.000000e+00
 ; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -505,13 +503,12 @@
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
 ; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[INCDEC_PTR2]] to <2 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x float> <float -2.000000e+00, float -3.000000e+00>, [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[INCDEC_PTR4]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -624,13 +621,14 @@
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
 ; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[INCDEC_PTR2]] to <2 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP3]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[INCDEC_PTR3]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP6]], <2 x float>* [[TMP7]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -658,21 +656,22 @@
 ; CHECK-LABEL: @addsub1f(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
+; CHECK-NEXT: store float [[TMP6]], float* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP7]], -3.000000e+00
 ; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -701,21 +700,20 @@
 ; CHECK-LABEL: @mulf(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> <float 2.570000e+02, float -3.000000e+00>, [[TMP1]]
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP2]], <2 x float>* [[TMP3]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT: store float [[TMP4]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP5]], -9.000000e+00
 ; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -786,17 +784,16 @@
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
 ; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[TMP1]], 1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], 2.000000e+00
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[INCDEC_PTR]] to <2 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> <float 1.000000e+00, float 2.000000e+00>, [[TMP2]]
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[INCDEC_PTR1]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP5]], 3.000000e+00
 ; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -834,13 +831,12 @@
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
 ; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00
-; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[INCDEC_PTR2]] to <2 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> <float -2.000000e+00, float -3.000000e+00>, [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[INCDEC_PTR4]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -944,21 +940,20 @@
 ; CHECK-LABEL: @mulfn(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fmul float [[TMP0]], 2.570000e+02
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[SUB3:%.*]] = fmul float [[TMP1]], -3.000000e+00
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> <float 2.570000e+02, float -3.000000e+00>, [[TMP1]]
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP2]], <2 x float>* [[TMP3]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT: store float [[TMP4]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP5]], -9.000000e+00
 ; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT: ret void
 ;