Index: include/llvm/Transforms/Vectorize/SLPVectorizer.h
===================================================================
--- include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -138,7 +138,7 @@
   bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
 
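+  /// If \p OnlyBitParallel is set, only bit-parallel operations (e.g. plain
+  /// copies) are vectorized, allowing the use of scalar registers.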
   bool vectorizeStoreChain(ArrayRef<Value *> Chain, slpvectorizer::BoUpSLP &R,
-                           unsigned VecRegSize);
+                           unsigned VecRegSize, bool OnlyBitParallel);
 
   bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);
 
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -336,7 +336,7 @@
 }
 
 /// \returns analysis of the Instructions in \p VL described in
-/// InstructionsState, the Opcode that we suppose the whole list 
+/// InstructionsState, the Opcode that we suppose the whole list
 /// could be vectorized even if its structure is diverse.
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                        unsigned BaseIndex = 0) {
@@ -497,14 +497,14 @@
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
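+  /// If \p OnlyBitParallel is set, only bit-parallel operations are
+  /// vectorized, so that the result can live in a scalar register.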
-  void buildTree(ArrayRef<Value *> Roots,
+  void buildTree(ArrayRef<Value *> Roots, bool OnlyBitParallel,
                  ArrayRef<Value *> UserIgnoreLst = None);
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst, taking
   /// into account (and updating it, if required) the list of externally used
   /// values stored in \p ExternallyUsedValues.
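+  /// If \p OnlyBitParallel is set, only bit-parallel operations are
+  /// vectorized, so that the result can live in a scalar register.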
-  void buildTree(ArrayRef<Value *> Roots,
+  void buildTree(ArrayRef<Value *> Roots, bool OnlyBitParallel,
                  ExtraValueToDebugLocsMap &ExternallyUsedValues,
                  ArrayRef<Value *> UserIgnoreLst = None);
 
@@ -585,7 +585,8 @@
   int getEntryCost(TreeEntry *E);
 
   /// This is the recursive part of buildTree.
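+  /// When \p OnlyBitParallel is set, bundles whose opcode is not
+  /// bit-parallel are gathered instead of vectorized.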
-  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
+  void buildTree_rec(ArrayRef<Value *> Roots, bool OnlyBitParallel,
+                     unsigned Depth, int);
 
   /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
   /// be vectorized to use the original vector (or aggregate "bitcast" to a
@@ -1290,20 +1291,20 @@
 
 } // end namespace llvm
 
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots, bool OnlyBitParallel,
                         ArrayRef<Value *> UserIgnoreLst) {
   ExtraValueToDebugLocsMap ExternallyUsedValues;
-  buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
+  buildTree(Roots, OnlyBitParallel, ExternallyUsedValues, UserIgnoreLst);
 }
 
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots, bool OnlyBitParallel,
                         ExtraValueToDebugLocsMap &ExternallyUsedValues,
                         ArrayRef<Value *> UserIgnoreLst) {
   deleteTree();
   UserIgnoreList = UserIgnoreLst;
   if (!allSameType(Roots))
     return;
-  buildTree_rec(Roots, 0, -1);
+  buildTree_rec(Roots, OnlyBitParallel, 0, -1);
 
   // Collect the values that we need to extract from the tree.
   for (TreeEntry &EIdx : VectorizableTree) {
@@ -1364,8 +1365,14 @@
   }
 }
 
-void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
-                            int UserTreeIdx) {
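+/// \returns true if \p Op is bit-parallel, i.e. computes each bit of its
+/// result from the corresponding bits of its operands only. Such operations
+/// can be performed on several elements packed into a single wide scalar
+/// (general purpose) register, because no information crosses element
+/// boundaries. Plain copies (Load/Store) and BitCast qualify; Add does not:
+/// packing <2 x i32> {1, 0} and {0xFFFFFFFF, 0} into i64s and adding gives
+/// 0x0000000000000001 + 0x00000000FFFFFFFF = 0x0000000100000000, i.e. the
+/// carry from element 0 leaks into element 1.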
+static bool isBitParallel(unsigned Op) {
+  // FIXME: Handle ICmp, And, Or, Xor.
+  return Op == Instruction::Load || Op == Instruction::BitCast ||
+         Op == Instruction::Store;
+}
+
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, bool OnlyBitParallel,
+                            unsigned Depth, int UserTreeIdx) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
 
   InstructionsState S = getSameOpcode(VL);
@@ -1501,6 +1508,13 @@
 
   unsigned ShuffleOrOp = S.isAltShuffle() ?
                 (unsigned) Instruction::ShuffleVector : S.Opcode;
+
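+  // When vectorizing for a scalar register, every node of the tree must
+  // itself be bit-parallel; gather anything that is not.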
+  if (OnlyBitParallel && !isBitParallel(ShuffleOrOp)) {
+    LLVM_DEBUG(dbgs() << "SLP: Gathering due to non-bit-parallel opcode.\n");
+    newTreeEntry(VL, false, UserTreeIdx);
+    return;
+  }
+
   switch (ShuffleOrOp) {
     case Instruction::PHI: {
       PHINode *PH = dyn_cast<PHINode>(VL0);
@@ -1530,7 +1544,7 @@
           Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
               PH->getIncomingBlock(i)));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1674,7 +1688,7 @@
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1704,7 +1718,7 @@
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1735,8 +1749,8 @@
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
         ValueList Left, Right;
         reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);
-        buildTree_rec(Left, Depth + 1, UserTreeIdx);
-        buildTree_rec(Right, Depth + 1, UserTreeIdx);
+        buildTree_rec(Left, OnlyBitParallel, Depth + 1, UserTreeIdx);
+        buildTree_rec(Right, OnlyBitParallel, Depth + 1, UserTreeIdx);
         return;
       }
 
@@ -1746,7 +1760,7 @@
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       }
       return;
 
@@ -1795,7 +1809,7 @@
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1816,7 +1830,7 @@
       for (Value *j : VL)
         Operands.push_back(cast<Instruction>(j)->getOperand(0));
 
-      buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+      buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       return;
     }
     case Instruction::Call: {
@@ -1879,7 +1893,7 @@
           CallInst *CI2 = dyn_cast<CallInst>(j);
           Operands.push_back(CI2->getArgOperand(i));
         }
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1899,8 +1913,8 @@
       if (isa<BinaryOperator>(VL0)) {
         ValueList Left, Right;
         reorderAltShuffleOperands(S, VL, Left, Right);
-        buildTree_rec(Left, Depth + 1, UserTreeIdx);
-        buildTree_rec(Right, Depth + 1, UserTreeIdx);
+        buildTree_rec(Left, OnlyBitParallel, Depth + 1, UserTreeIdx);
+        buildTree_rec(Right, OnlyBitParallel, Depth + 1, UserTreeIdx);
         return;
       }
 
@@ -1910,7 +1924,7 @@
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       }
       return;
 
@@ -4639,7 +4653,8 @@
 }
 
 bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
-                                            unsigned VecRegSize) {
+                                            unsigned VecRegSize,
+                                            bool OnlyBitParallel) {
   const unsigned ChainLen = Chain.size();
   LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
                     << "\n");
@@ -4664,7 +4679,7 @@
                       << "\n");
     ArrayRef<Value *> Operands = Chain.slice(i, VF);
 
-    R.buildTree(Operands);
+    R.buildTree(Operands, OnlyBitParallel);
     if (R.isTreeTinyAndNotFullyVectorizable())
       continue;
 
@@ -4761,13 +4776,25 @@
      // register size is a power-of-2?
+    bool VectorizedChain = false;
     for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
          Size /= 2) {
-      if (vectorizeStoreChain(Operands, R, Size)) {
+      if (vectorizeStoreChain(Operands, R, Size, false)) {
         // Mark the vectorized stores so that we don't vectorize them again.
         VectorizedStores.insert(Operands.begin(), Operands.end());
-        Changed = true;
+        Changed = VectorizedChain = true;
         break;
       }
     }
+    // If the chain could not be vectorized using vector registers, try again
+    // using scalar (general purpose) registers. Only allow operations that
+    // are intrinsically bit-parallel.
+    // FIXME: Extend to logical bitwise operations (e.g. XOR/OR/AND). We will
+    // need to check flags.
+    // FIXME: Extend to heterogeneous element sizes (e.g. packing 2 x i8,
+    // 1 x i16 and 1 x i32 into one 64-bit register).
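+    // getRegisterBitWidth(false) is the width of the widest scalar register
+    // on the target (e.g. 64 bits on x86-64).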
+    if (!VectorizedChain &&
+        vectorizeStoreChain(Operands, R, TTI->getRegisterBitWidth(false),
+                            true)) {
+      // Mark the vectorized stores so that we don't vectorize them again.
+      VectorizedStores.insert(Operands.begin(), Operands.end());
+      Changed = true;
+    }
   }
 
   return Changed;
@@ -4895,7 +4922,7 @@
                         << "\n");
       ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
 
-      R.buildTree(Ops);
+      R.buildTree(Ops, false);
       Optional<ArrayRef<unsigned>> Order = R.bestOrder();
       // TODO: check if we can allow reordering for more cases.
       if (AllowReorder && Order) {
@@ -4906,7 +4933,7 @@
         // there are exactly two operations.
         assert(Ops.size() == 2);
         Value *ReorderedOps[] = {Ops[1], Ops[0]};
-        R.buildTree(ReorderedOps, None);
+        R.buildTree(ReorderedOps, false, None);
       }
       if (R.isTreeTinyAndNotFullyVectorizable())
         continue;
@@ -5644,7 +5671,7 @@
       IgnoreList.append(V.begin(), V.end());
     while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
       auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
-      V.buildTree(VL, ExternallyUsedValues, IgnoreList);
+      V.buildTree(VL, false, ExternallyUsedValues, IgnoreList);
       Optional<ArrayRef<unsigned>> Order = V.bestOrder();
       // TODO: Handle orders of size less than number of elements in the vector.
       if (Order && Order->size() == VL.size()) {
@@ -5652,7 +5679,7 @@
         SmallVector<Value *, 4> ReorderedOps(VL.size());
         llvm::transform(*Order, ReorderedOps.begin(),
                         [VL](const unsigned Idx) { return VL[Idx]; });
-        V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
+        V.buildTree(ReorderedOps, false, ExternallyUsedValues, IgnoreList);
       }
       if (V.isTreeTinyAndNotFullyVectorizable())
         break;
Index: test/Transforms/SLPVectorizer/X86/bit-parallel.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/bit-parallel.ll
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux"
+
+; This tests vectorization of bit-parallel operations (e.g. COPY) using General
+; Purpose Registers.
+;
+; four_i32 tests vectorization of 4xi32 copy. This is vectorized using a vector
+; register.
+;
+; two_i32 tests vectorization of 2xi32 copy. Copying (load/store without
+; modifications) is trivially bit-parallel and can be vectorized using a GPR.
+;
+; two_i32_swap tests vectorization of a 2xi32 copy in which the two elements
+; are swapped.
+;
+; two_i32_add is a negative test: a 2xi32 ADD must NOT be vectorized this
+; way, since ADD is not bit-parallel.
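+; (A single wide scalar ADD would let the carry from one element propagate
+; into the next, changing the per-element results.)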
+
+
+; four_i32
+;
+;struct S {
+;  int32_t a;
+;  int32_t b;
+;  int32_t c;
+;  int32_t d;
+;  int64_t e;
+;  int32_t f;
+;};
+;
+;S copy_4xi32(const S& s) {
+;  S result;
+;  result.a = s.a;
+;  result.b = s.b;
+;  result.c = s.c;
+;  result.d = s.d;
+;  return result;
+;}
+
+%struct.S4x32 = type { i32, i32, i32, i32, i64, i32 }
+
+define void @four_i32(%struct.S4x32* noalias nocapture sret, %struct.S4x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @four_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32:%.*]], %struct.S4x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT:    [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0:%.*]], i64 0, i32 0
+; CHECK-NEXT:    [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT:    [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0]], i64 0, i32 1
+; CHECK-NEXT:    [[C_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP1]], i64 0, i32 2
+; CHECK-NEXT:    [[C_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0]], i64 0, i32 2
+; CHECK-NEXT:    [[D_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP1]], i64 0, i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A_SRC_PTR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 8
+; CHECK-NEXT:    [[D_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0]], i64 0, i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[A_DST_PTR]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 0
+  %a = load i32, i32* %a_src_ptr, align 8
+  %a_dst_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %0, i64 0, i32 0
+  store i32 %a, i32* %a_dst_ptr, align 8
+  %b_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 1
+  %b = load i32, i32* %b_src_ptr, align 8
+  %b_dst_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %0, i64 0, i32 1
+  store i32 %b, i32* %b_dst_ptr, align 8
+  %c_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 2
+  %c = load i32, i32* %c_src_ptr, align 8
+  %c_dst_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %0, i64 0, i32 2
+  store i32 %c, i32* %c_dst_ptr, align 8
+  %d_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 3
+  %d = load i32, i32* %d_src_ptr, align 8
+  %d_dst_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %0, i64 0, i32 3
+  store i32 %d, i32* %d_dst_ptr, align 8
+  ret void
+}
+
+; two_i32
+;
+;struct S {
+;  int32_t a;
+;  int32_t b;
+;  int64_t c;
+;  int32_t d;
+;};
+;
+;S copy_2xi32(const S& s) {
+;  S result;
+;  result.a = s.a;
+;  result.b = s.b;
+;  return result;
+;}
+
+%struct.S2x32 = type { i32, i32, i64, i32 }
+
+define void @two_i32(%struct.S2x32* noalias nocapture sret, %struct.S2x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @two_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT:    [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 0
+; CHECK-NEXT:    [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A_SRC_PTR]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 8
+; CHECK-NEXT:    [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[A_DST_PTR]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0
+  %a = load i32, i32* %a_src_ptr, align 8
+  %a_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0
+  store i32 %a, i32* %a_dst_ptr, align 8
+  %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1
+  %b = load i32, i32* %b_src_ptr, align 8
+  %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1
+  store i32 %b, i32* %b_dst_ptr, align 8
+  ret void
+}
+
+define void @two_i32_swap(%struct.S2x32* noalias nocapture sret, %struct.S2x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @two_i32_swap(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT:    [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 1
+; CHECK-NEXT:    [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A_SRC_PTR]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 8
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[B_DST_PTR]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[REORDER_SHUFFLE]], <2 x i32>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0
+  %a = load i32, i32* %a_src_ptr, align 8
+  %a_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1
+  store i32 %a, i32* %a_dst_ptr, align 8
+  %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1
+  %b = load i32, i32* %b_src_ptr, align 8
+  %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0
+  store i32 %b, i32* %b_dst_ptr, align 8
+  ret void
+}
+
+define void @two_i32_add(%struct.S2x32* noalias nocapture sret, %struct.S2x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @two_i32_add(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[A_SRC_PTR]], align 8
+; CHECK-NEXT:    [[A_PLUS_1:%.*]] = add nsw i32 [[A]], 1
+; CHECK-NEXT:    [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 0
+; CHECK-NEXT:    store i32 [[A_PLUS_1]], i32* [[A_DST_PTR]], align 8
+; CHECK-NEXT:    [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* [[B_SRC_PTR]], align 8
+; CHECK-NEXT:    [[B_PLUS_1:%.*]] = add nsw i32 [[B]], 1
+; CHECK-NEXT:    [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 1
+; CHECK-NEXT:    store i32 [[B_PLUS_1]], i32* [[B_DST_PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0
+  %a = load i32, i32* %a_src_ptr, align 8
+  %a_plus_1 = add nsw i32 %a, 1
+  %a_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0
+  store i32 %a_plus_1, i32* %a_dst_ptr, align 8
+  %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1
+  %b = load i32, i32* %b_src_ptr, align 8
+  %b_plus_1 = add nsw i32 %b, 1
+  %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1
+  store i32 %b_plus_1, i32* %b_dst_ptr, align 8
+  ret void
+}
Index: test/Transforms/SLPVectorizer/X86/tiny-tree.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/tiny-tree.ll
+++ test/Transforms/SLPVectorizer/X86/tiny-tree.ll
@@ -172,13 +172,13 @@
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1
 ; CHECK-NEXT:    store float [[TMP1]], float* [[ARRAYIDX3]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2
-; CHECK-NEXT:    store float [[TMP2]], float* [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3
-; CHECK-NEXT:    store float [[TMP3]], float* [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[ARRAYIDX5]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]]
 ; CHECK-NEXT:    [[ADD_PTR8]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 [[I_023]]
 ; CHECK-NEXT:    [[INC]] = add i64 [[I_023]], 1