Index: include/llvm/Transforms/Vectorize/SLPVectorizer.h =================================================================== --- include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -138,7 +138,7 @@ bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R); bool vectorizeStoreChain(ArrayRef Chain, slpvectorizer::BoUpSLP &R, - unsigned VecRegSize); + unsigned VecRegSize, bool OnlyBitParallel); bool vectorizeStores(ArrayRef Stores, slpvectorizer::BoUpSLP &R); Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -336,7 +336,7 @@ } /// \returns analysis of the Instructions in \p VL described in -/// InstructionsState, the Opcode that we suppose the whole list +/// InstructionsState, the Opcode that we suppose the whole list /// could be vectorized even if its structure is diverse. static InstructionsState getSameOpcode(ArrayRef VL, unsigned BaseIndex = 0) { @@ -498,6 +498,7 @@ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. void buildTree(ArrayRef Roots, + bool IsSwar, ArrayRef UserIgnoreLst = None); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for @@ -505,6 +506,7 @@ /// into account (anf updating it, if required) list of externally used /// values stored in \p ExternallyUsedValues. void buildTree(ArrayRef Roots, + bool IsSwar, ExtraValueToDebugLocsMap &ExternallyUsedValues, ArrayRef UserIgnoreLst = None); @@ -521,6 +523,7 @@ BS->clear(); } MinBWs.clear(); + IsSwar = false; } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -573,6 +576,9 @@ /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable(); + /// \returns whether the VectorizableTree has external uses. + bool hasExternalUses() const { return !ExternalUses.empty(); } + OptimizationRemarkEmitter *getORE() { return ORE; } private: @@ -1208,6 +1214,11 @@ /// value must be signed-extended, rather than zero-extended, back to its /// original width. MapVector> MinBWs; + + /// Is this a SWAR vectorization ? If true, the result type is a scalar type + /// and not a vector type. The "lanes" of the vector are contiguous bit + /// intervals (e.g. i64 is split into bits [63-32] and [31-0]). + bool IsSwar = false; }; } // end namespace slpvectorizer @@ -1291,15 +1302,18 @@ } // end namespace llvm void BoUpSLP::buildTree(ArrayRef Roots, + bool IsSwar, ArrayRef UserIgnoreLst) { ExtraValueToDebugLocsMap ExternallyUsedValues; - buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); + buildTree(Roots, IsSwar, ExternallyUsedValues, UserIgnoreLst); } void BoUpSLP::buildTree(ArrayRef Roots, + bool IsSwar, ExtraValueToDebugLocsMap &ExternallyUsedValues, ArrayRef UserIgnoreLst) { deleteTree(); + this->IsSwar = IsSwar; UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; @@ -1364,6 +1378,11 @@ } } +static bool isBitParallel(unsigned Op) { + // FIXME: Handle ICmp, And, Or, Xor, BitCast. + return Op == Instruction::Load || Op == Instruction::Store; +} + void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, int UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -1501,6 +1520,13 @@ unsigned ShuffleOrOp = S.isAltShuffle() ? 
(unsigned) Instruction::ShuffleVector : S.Opcode; + + if (IsSwar && !isBitParallel(ShuffleOrOp)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to non bit-parallel SWAR.\n"); + newTreeEntry(VL, false, UserTreeIdx); + return; + } + switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); @@ -1627,6 +1653,11 @@ LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. + if (IsSwar) { + LLVM_DEBUG(dbgs() << "SLP: shuffle in SWAR.\n"); + newTreeEntry(VL, false, UserTreeIdx); + return; + } auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, @@ -3010,7 +3041,9 @@ Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); + VectorType *const VecTy = IsSwar ? nullptr : VectorType::get(ScalarTy, E->Scalars.size()); + IntegerType *const SwarTy = IsSwar ? IntegerType::get(F->getContext(), ScalarTy->getIntegerBitWidth() * E->Scalars.size()) : nullptr; + Type* const VecOrSwarTy = IsSwar ? static_cast(SwarTy) : static_cast(VecTy); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); @@ -3306,7 +3339,7 @@ unsigned AS = LI->getPointerAddressSpace(); Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), - VecTy->getPointerTo(AS)); + VecOrSwarTy->getPointerTo(AS)); // The pointer operand uses an in-tree scalar so we add the new BitCast to // ExternalUses list to make sure that an extract will be generated in the @@ -3323,12 +3356,14 @@ LI->setAlignment(Alignment); Value *V = propagateMetadata(LI, E->Scalars); if (IsReorder) { + assert(!IsSwar); OrdersType Mask; inversePermutation(E->ReorderIndices, Mask); V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask, "reorder_shuffle"); } if (NeedToShuffleReuses) { + assert(!IsSwar); // TODO: Merge this shuffle with the ReorderShuffleMask. V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); @@ -3350,7 +3385,7 @@ Value *VecValue = vectorizeTree(ScalarStoreValues); Value *ScalarPtr = SI->getPointerOperand(); - Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); + Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecOrSwarTy->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); // The pointer operand uses an in-tree scalar, so add the new BitCast to @@ -3365,6 +3400,7 @@ ST->setAlignment(Alignment); Value *V = propagateMetadata(ST, E->Scalars); if (NeedToShuffleReuses) { + assert(!IsSwar); V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } @@ -3566,6 +3602,7 @@ // Extract all of the elements with the external uses. 
for (const auto &ExternalUse : ExternalUses) { + assert(!IsSwar && "not implemented: extract in SWAR"); Value *Scalar = ExternalUse.Scalar; llvm::User *User = ExternalUse.User; @@ -4633,7 +4670,7 @@ } bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, - unsigned VecRegSize) { + unsigned VecRegSize, const bool IsSwar) { const unsigned ChainLen = Chain.size(); LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen << "\n"); @@ -4658,9 +4695,13 @@ << "\n"); ArrayRef Operands = Chain.slice(i, VF); - R.buildTree(Operands); + R.buildTree(Operands, IsSwar); if (R.isTreeTinyAndNotFullyVectorizable()) continue; + if (IsSwar && R.hasExternalUses()) { + LLVM_DEBUG(dbgs() << "SLP: Ignoring SWAR tree with external uses\n"); + continue; + } R.computeMinimumValueSizes(); @@ -4755,13 +4796,26 @@ // register size is a power-of-2? for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); Size /= 2) { - if (vectorizeStoreChain(Operands, R, Size)) { + if (vectorizeStoreChain(Operands, R, Size, false)) { // Mark the vectorized stores so that we don't vectorize them again. VectorizedStores.insert(Operands.begin(), Operands.end()); Changed = true; break; } } + // Now try to vectorize using SWAR (https://en.wikipedia.org/wiki/SWAR). + // Only allow operations that are instrinsically bit-parallel. + // FIXME: Extend to logical bitwise operations (e.g. XOR/OR/AND). We will + // need to check flags. + // FIXME: Extend to heterogeneous sizes (< 2xi8, 1xi16, 1xi32>). This is + // easy for copies but requires careful handling of shuffles to avoid + // generating inefficient code. + if (!Changed && vectorizeStoreChain(Operands, R, TTI->getRegisterBitWidth(false), true)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Operands.begin(), Operands.end()); + Changed = true; + break; + } } return Changed; @@ -4889,7 +4943,7 @@ << "\n"); ArrayRef Ops = VL.slice(I, OpsWidth); - R.buildTree(Ops); + R.buildTree(Ops, false); Optional> Order = R.bestOrder(); // TODO: check if we can allow reordering for more cases. if (AllowReorder && Order) { @@ -4900,7 +4954,7 @@ // there are exactly two operations. assert(Ops.size() == 2); Value *ReorderedOps[] = {Ops[1], Ops[0]}; - R.buildTree(ReorderedOps, None); + R.buildTree(ReorderedOps, false, None); } if (R.isTreeTinyAndNotFullyVectorizable()) continue; @@ -5638,7 +5692,7 @@ IgnoreList.append(V.begin(), V.end()); while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, ExternallyUsedValues, IgnoreList); + V.buildTree(VL, false, ExternallyUsedValues, IgnoreList); Optional> Order = V.bestOrder(); // TODO: Handle orders of size less than number of elements in the vector. 
if (Order && Order->size() == VL.size()) { @@ -5646,7 +5700,7 @@ SmallVector ReorderedOps(VL.size()); llvm::transform(*Order, ReorderedOps.begin(), [VL](const unsigned Idx) { return VL[Idx]; }); - V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); + V.buildTree(ReorderedOps, false, ExternallyUsedValues, IgnoreList); } if (V.isTreeTinyAndNotFullyVectorizable()) break; Index: test/Transforms/SLPVectorizer/X86/swar.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/X86/swar.ll @@ -0,0 +1,228 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux -mcpu=corei7 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux" + +; This tests vectorization of bit-parallel operations (e.g. COPY) using SWAR. +; +; four_i32 tests vectorization of 4xi32 copy. This is vectorized using a vector +; register. +; +; two_i32 tests vectorization of 2xi32 copy. Copying (load/store without +; modifications) is trivially bit-parallel and can be vectorized using SWAR. +; +; two_i32_swap tests vectorization of 2xi32 copy with swapping. +; +; two_i32_add negative-tests vectorization of 2xi32 ADD. This should NOT be +; vectorized as ADD is not bit-parallel. + + +; four_i32 +; +;struct S { +; int32_t a; +; int32_t b; +; int32_t c; +; int32_t d; +; int64_t e; +; int32_t f; +;}; +; +;S copy_2xi32(const S& s) { +; S result; +; result.a = s.a; +; result.b = s.b; +; result.c = s.c; +; result.d = s.d; +; return result; +;} + +%struct.S4x32 = type { i32, i32, i32, i32, i64, i32 } + +define void @four_i32(%struct.S4x32* noalias nocapture sret, %struct.S4x32* nocapture readonly dereferenceable(24)) { +; CHECK-LABEL: @four_i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32:%.*]], %struct.S4x32* [[TMP1:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP1]], i64 0, i32 1 +; CHECK-NEXT: [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0]], i64 0, i32 1 +; CHECK-NEXT: [[C_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP1]], i64 0, i32 2 +; CHECK-NEXT: [[C_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0]], i64 0, i32 2 +; CHECK-NEXT: [[D_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP1]], i64 0, i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A_SRC_PTR]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 8 +; CHECK-NEXT: [[D_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0]], i64 0, i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[A_DST_PTR]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 8 +; CHECK-NEXT: ret void +; +entry: + %a_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 0 + %a = load i32, i32* %a_src_ptr, align 8 + %a_dst_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %0, i64 0, i32 0 + store i32 %a, i32* %a_dst_ptr, align 8 + %b_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 1 + %b = load i32, i32* %b_src_ptr, align 8 + %b_dst_ptr = getelementptr inbounds %struct.S4x32, 
%struct.S4x32* %0, i64 0, i32 1
+ store i32 %b, i32* %b_dst_ptr, align 8
+ %c_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 2
+ %c = load i32, i32* %c_src_ptr, align 8
+ %c_dst_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %0, i64 0, i32 2
+ store i32 %c, i32* %c_dst_ptr, align 8
+ %d_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 3
+ %d = load i32, i32* %d_src_ptr, align 8
+ %d_dst_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %0, i64 0, i32 3
+ store i32 %d, i32* %d_dst_ptr, align 8
+ ret void
+}
+
+; two_i32
+;
+;struct S {
+; int32_t a;
+; int32_t b;
+; int64_t c;
+; int32_t d;
+;};
+;
+;S copy_2xi32(const S& s) {
+; S result;
+; result.a = s.a;
+; result.b = s.b;
+; return result;
+;}
+
+%struct.S2x32 = type { i32, i32, i64, i32 }
+
+define void @two_i32(%struct.S2x32* noalias nocapture sret, %struct.S2x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @two_i32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A_SRC_PTR]] to <2 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 8
+; CHECK-NEXT: [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[A_DST_PTR]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0
+ %a = load i32, i32* %a_src_ptr, align 8
+ %a_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0
+ store i32 %a, i32* %a_dst_ptr, align 8
+ %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1
+ %b = load i32, i32* %b_src_ptr, align 8
+ %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1
+ store i32 %b, i32* %b_dst_ptr, align 8
+ ret void
+}
+
+define void @two_i32_swap(%struct.S2x32* noalias nocapture sret, %struct.S2x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @two_i32_swap(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 1
+; CHECK-NEXT: [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A_SRC_PTR]] to <2 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 8
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B_DST_PTR]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[REORDER_SHUFFLE]], <2 x i32>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0
+ %a = load i32, i32* %a_src_ptr, align 8
+ %a_dst_ptr = getelementptr 
inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1
+ store i32 %a, i32* %a_dst_ptr, align 8
+ %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1
+ %b = load i32, i32* %b_src_ptr, align 8
+ %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0
+ store i32 %b, i32* %b_dst_ptr, align 8
+ ret void
+}
+
+define void @two_i32_add(%struct.S2x32* noalias nocapture sret, %struct.S2x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @two_i32_add(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[A:%.*]] = load i32, i32* [[A_SRC_PTR]], align 8
+; CHECK-NEXT: [[A_PLUS_1:%.*]] = add nsw i32 [[A]], 1
+; CHECK-NEXT: [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 0
+; CHECK-NEXT: store i32 [[A_PLUS_1]], i32* [[A_DST_PTR]], align 8
+; CHECK-NEXT: [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT: [[B:%.*]] = load i32, i32* [[B_SRC_PTR]], align 8
+; CHECK-NEXT: [[B_PLUS_1:%.*]] = add nsw i32 [[B]], 1
+; CHECK-NEXT: [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 1
+; CHECK-NEXT: store i32 [[B_PLUS_1]], i32* [[B_DST_PTR]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0
+ %a = load i32, i32* %a_src_ptr, align 8
+ %a_plus_1 = add nsw i32 %a, 1
+ %a_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0
+ store i32 %a_plus_1, i32* %a_dst_ptr, align 8
+ %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1
+ %b = load i32, i32* %b_src_ptr, align 8
+ %b_plus_1 = add nsw i32 %b, 1
+ %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1
+ store i32 %b_plus_1, i32* %b_dst_ptr, align 8
+ ret void
+}
+
+define i32 @two_i32_extract(%struct.S2x32* noalias nocapture, %struct.S2x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @two_i32_extract(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A_SRC_PTR]] to <2 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 8
+; CHECK-NEXT: [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[A_DST_PTR]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: ret i32 [[TMP5]]
+;
+entry:
+ %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0
+ %a = load i32, i32* %a_src_ptr, align 8
+ %a_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0
+ store i32 %a, i32* %a_dst_ptr, align 8
+ %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1
+ %b = load i32, i32* %b_src_ptr, align 8
+ %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1
+ store i32 %b, i32* 
%b_dst_ptr, align 8
+ ret i32 %b
+}
+
+define i32 @two_i32_insert(%struct.S2x32* noalias nocapture, %struct.S2x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @two_i32_insert(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A_SRC_PTR]] to <2 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 8
+; CHECK-NEXT: [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[A_DST_PTR]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: ret i32 [[TMP5]]
+;
+entry:
+ %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0
+ %a = load i32, i32* %a_src_ptr, align 8
+ %a_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0
+ store i32 %a, i32* %a_dst_ptr, align 8
+ %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1
+ %b = load i32, i32* %b_src_ptr, align 8
+ %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1
+ store i32 %b, i32* %b_dst_ptr, align 8
+ ret i32 %b
+}
Index: test/Transforms/SLPVectorizer/X86/tiny-tree.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/tiny-tree.ll
+++ test/Transforms/SLPVectorizer/X86/tiny-tree.ll
@@ -172,13 +172,13 @@
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1
; CHECK-NEXT: store float [[TMP1]], float* [[ARRAYIDX3]], align 4
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2
-; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3
-; CHECK-NEXT: store float [[TMP3]], float* [[ARRAYIDX7]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX5]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]]
; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 [[I_023]]
; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1
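
For reference, here is a minimal sketch of the scalar-register form that the IsSwar path is meant to produce for a 2 x i32 copy when the ordinary vector path is not taken: SwarTy is IntegerType::get(Ctx, NumScalars * ScalarBits), i.e. i64 here, so the whole store chain becomes a single i64 load and store through bitcast pointers instead of a <2 x i32> access. This is illustrative only and is not output generated by the tests above; the type %struct.pair and the function @swar_copy are hypothetical names.

; Illustrative SWAR form of a 2 x i32 copy (hypothetical, not part of the patch).
%struct.pair = type { i32, i32 }

define void @swar_copy(%struct.pair* noalias nocapture %dst, %struct.pair* nocapture readonly %src) {
entry:
  %src_a = getelementptr inbounds %struct.pair, %struct.pair* %src, i64 0, i32 0
  %dst_a = getelementptr inbounds %struct.pair, %struct.pair* %dst, i64 0, i32 0
  %src64 = bitcast i32* %src_a to i64*
  %dst64 = bitcast i32* %dst_a to i64*
  ; Both 32-bit lanes move in one 64-bit scalar access; on a little-endian
  ; target the first field occupies bits [31:0] and the second bits [63:32].
  %v = load i64, i64* %src64, align 4
  store i64 %v, i64* %dst64, align 4
  ret void
}

Because the lanes share one integer register, only bit-parallel operations (here, plain copies) are legal on this form; lane-crossing arithmetic such as the add in two_i32_add would let carries leak between lanes, which is why isBitParallel() currently accepts only Load and Store.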