Index: include/llvm/Analysis/VectorUtils.h
===================================================================
--- include/llvm/Analysis/VectorUtils.h
+++ include/llvm/Analysis/VectorUtils.h
@@ -167,6 +167,24 @@
 Constant *createSequentialMask(IRBuilder<> &Builder, unsigned Start,
                                unsigned NumInts, unsigned NumUndefs);
 
+/// Create a transpose shuffle mask.
+///
+/// This function creates a shuffle mask useful for transposing a 2xn matrix.
+/// The mask reads corresponding even- or odd-numbered vector elements from two
+/// n-dimensional source vectors and writes each result into consecutive
+/// elements of an n-dimensional destination vector. The elements of the mask
+/// begin with either zero or one depending on the value of \p IsOdd, and are
+/// of the form:
+///
+///   <0, VF + 0, 2, VF + 2, ..., VF - 2, 2 * VF - 2>, for IsOdd = false, and
+///   <1, VF + 1, 3, VF + 3, ..., VF - 1, 2 * VF - 1>, for IsOdd = true.
+///
+/// For example, the mask for VF = 4 is:
+///
+///   <0, 4, 2, 6>, for IsOdd = false, and
+///   <1, 5, 3, 7>, for IsOdd = true.
+Constant *createTransposeMask(IRBuilder<> &Builder, unsigned VF, bool IsOdd);
+
 /// Concatenate a list of vectors.
 ///
 /// This function generates code that concatenate the vectors in \p Vecs into a
Index: lib/Analysis/VectorUtils.cpp
===================================================================
--- lib/Analysis/VectorUtils.cpp
+++ lib/Analysis/VectorUtils.cpp
@@ -523,6 +523,17 @@
   return ConstantVector::get(Mask);
 }
 
+Constant *llvm::createTransposeMask(IRBuilder<> &Builder, unsigned VF,
+                                    bool IsOdd) {
+  SmallVector<Constant *, 16> Mask;
+  unsigned StartVal = IsOdd ? 1 : 0;
+  for (unsigned I = 0; I < VF; I += 2)
+    for (unsigned J = 0; J < 2; ++J)
+      Mask.push_back(Builder.getInt32(StartVal + I + VF * J));
+
+  return ConstantVector::get(Mask);
+}
+
 /// A helper function for concatenating vectors. This function concatenates two
 /// vectors having the same element type. If the second vector has fewer
 /// elements than the first, it is padded with undefs.
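To make the mask layout concrete, here is a small standalone sketch (not part of the patch) that computes the same index pattern createTransposeMask builds, using plain unsigned values in place of ConstantInt; the helper name transposeMaskIndices is illustrative only. For VF = 4 it prints 0 4 2 6 for the even mask and 1 5 3 7 for the odd mask, matching the example in the header comment.

    #include <cstdio>
    #include <vector>

    // Mirrors the index pattern of createTransposeMask (illustrative only).
    static std::vector<unsigned> transposeMaskIndices(unsigned VF, bool IsOdd) {
      std::vector<unsigned> Mask;
      unsigned StartVal = IsOdd ? 1 : 0;
      for (unsigned I = 0; I < VF; I += 2)   // walk the even (or odd) lanes
        for (unsigned J = 0; J < 2; ++J)     // pull the lane from each source vector
          Mask.push_back(StartVal + I + VF * J);
      return Mask;
    }

    int main() {
      for (bool IsOdd : {false, true}) {
        for (unsigned Idx : transposeMaskIndices(4, IsOdd))
          std::printf("%u ", Idx);           // 0 4 2 6  /  1 5 3 7
        std::printf("\n");
      }
      return 0;
    }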
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -747,12 +747,17 @@
     /// The TreeEntry index containing the user of this entry. We can actually
     /// have multiple users so the data structure is not truly a tree.
     SmallVector<int, 1> UserTreeIndices;
+
+    /// Have the operands of this tree entry been transposed to enable
+    /// vectorization?
+    SmallVector<ValueList, 4> TransposedOperands;
   };
 
   /// Create a new VectorizableTree entry.
   void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx,
                     ArrayRef<unsigned> ReuseShuffleIndices = None,
-                    ArrayRef<unsigned> ReorderIndices = None) {
+                    ArrayRef<unsigned> ReorderIndices = None,
+                    ArrayRef<ValueList> TransposedOperands = None) {
     VectorizableTree.emplace_back(VectorizableTree);
     int idx = VectorizableTree.size() - 1;
     TreeEntry *Last = &VectorizableTree[idx];
@@ -761,6 +766,8 @@
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                      ReuseShuffleIndices.end());
     Last->ReorderIndices = ReorderIndices;
+    Last->TransposedOperands.append(TransposedOperands.begin(),
+                                    TransposedOperands.end());
     if (Vectorized) {
       for (int i = 0, e = VL.size(); i != e; ++i) {
         assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
@@ -1362,6 +1369,178 @@
 
 } // end namespace llvm
 
+/// Try to transpose a bundle of binary operations if doing so would
+/// enable vectorization.
+///
+/// Given a bundle of isomorphic binary operations, we normally try to
+/// vectorize their operands by bundling all operand-zero values together and
+/// bundling all operand-one values together. Thus, assuming all operands are
+/// instructions, each operand bundle must also be isomorphic. That is, all
+/// operand-zero values must have the same opcode and all operand-one values
+/// must have the same opcode. For example:
+///
+///   ; Original expression: Each operand bundle has a single opcode (the
+///   ; operand-zero bundle is all 'add' and the operand-one bundle is all
+///   ; 'sub'), so the operations are vectorizable.
+///   add (add s0, s1) (sub s2, s3)
+///   add (add s4, s5) (sub s6, s7)
+///
+///   ; Vectorized IR (simplified)
+///   %a = add <s0, s4>, <s1, s5>        ; = <add s0 s1, add s4 s5>
+///   %b = sub <s2, s6>, <s3, s7>        ; = <sub s2 s3, sub s6 s7>
+///   %c = add %a, %b
+///
+/// However, if an operand bundle is nonisomorphic (i.e., the instructions in
+/// the bundle have more than one opcode), it is not vectorizable. If this is
+/// the case, we can try to "transpose" the operands such that each resulting
+/// bundle will have a single opcode. For example:
+///
+///   ; Neither operand bundle is vectorizable because they each contain more
+///   ; than one opcode ('add' and 'sub'). However, we can transpose the
+///   ; bundles to enable vectorization.
+///   add (add s0, s1) (add s2, s3)
+///   add (sub s4, s5) (sub s6, s7)
+///
+///   ; Vectorized IR (simplified)
+///   %a = add <s0, s2>, <s1, s3>        ; = <add s0 s1, add s2 s3>
+///   %b = sub <s4, s6>, <s5, s7>        ; = <sub s4 s5, sub s6 s7>
+///   %c = shufflevector %a, %b, <0, 2>  ; = <add s0 s1, sub s4 s5>
+///   %d = shufflevector %a, %b, <1, 3>  ; = <add s2 s3, sub s6 s7>
+///   %e = add %c, %d
+///
+/// When we transpose the operands, we bundle together all operands of one or
+/// more instructions, rather than bundling together a single operand (e.g.,
+/// operand-zero) of several instructions. Conceptually, this is a row-wise
+/// bundle instead of a column-wise bundle, hence the name "transpose". If a
+/// transpose is performed while building a vectorizable tree, we must
+/// re-transpose the vectors at run-time using shufflevector instructions to
+/// restore the correct bundling.
+///
+/// This function determines if a transpose is needed for a given bundle, and
+/// if so, returns a list of isomorphic "operand" bundles.
+static Optional<SmallVector<ValueList, 4>>
+transposeBinOpBundle(ArrayRef<Value *> VL) {
+  assert(!VL.empty() && isa<BinaryOperator>(VL[0]) && "No binary operator");
+  assert(getSameOpcode(VL).Opcode && "No single opcode");
+  assert(!getSameOpcode(VL).IsAltShuffle && "Has alternate opcode");
+
+  if (VL.size() < 2)
+    return None;
+
+  // Check that for each instruction in the given bundle, its operands are also
+  // instructions having identical opcodes.
+  if (any_of(VL, [](Value *V) {
+        auto *VLI = cast<Instruction>(V);
+        auto *VLIOp0 = dyn_cast<Instruction>(VLI->getOperand(0));
+        auto *VLIOp1 = dyn_cast<Instruction>(VLI->getOperand(1));
+        return !VLIOp0 || !VLIOp1 ||
+               VLIOp0->getOpcode() != VLIOp1->getOpcode();
+      }))
+    return None;
+
+  auto *VL0 = cast<Instruction>(VL[0]);
+  auto *VL0Op0 = cast<Instruction>(VL0->getOperand(0));
+  auto *VL0Op1 = cast<Instruction>(VL0->getOperand(1));
+
+  // Check if the operand-zero bundle or the operand-one bundle is isomorphic.
+  // If this is the case, a transpose is unnecessary since at least one of the
+  // bundles is already vectorizable.
+  bool Op0IsIsomorphic = true;
+  bool Op1IsIsomorphic = true;
+  for (Value *V : VL.drop_front()) {
+    auto *VLIOp0 = cast<Instruction>(cast<Instruction>(V)->getOperand(0));
+    auto *VLIOp1 = cast<Instruction>(cast<Instruction>(V)->getOperand(1));
+    Op0IsIsomorphic &= VLIOp0->getOpcode() == VL0Op0->getOpcode();
+    Op1IsIsomorphic &= VLIOp1->getOpcode() == VL0Op1->getOpcode();
+    if (!Op0IsIsomorphic && !Op1IsIsomorphic)
+      break;
+  }
+  if (Op0IsIsomorphic || Op1IsIsomorphic)
+    return None;
+
+  // Group the operands according to their opcode. If the size of any group is
+  // not a power of two, we aren't going to transpose. Note that we've already
+  // shown that both operands of a given instruction have the same opcode.
+  MapVector<unsigned, ValueList> Opcode2Operands;
+  for (Value *V : VL) {
+    auto *VLIOp0 = cast<Instruction>(cast<Instruction>(V)->getOperand(0));
+    auto *VLIOp1 = cast<Instruction>(cast<Instruction>(V)->getOperand(1));
+    unsigned SameOpcode = VLIOp0->getOpcode();
+    assert(VLIOp1->getOpcode() == SameOpcode &&
+           "Operands should have the same opcode");
+    Opcode2Operands[SameOpcode].push_back(VLIOp0);
+    Opcode2Operands[SameOpcode].push_back(VLIOp1);
+  }
+  if (any_of(Opcode2Operands,
+             [](std::pair<unsigned, ValueList> &Entry) {
+               return !isPowerOf2_32(Entry.second.size());
+             }))
+    return None;
+
+  // Find the size of the smallest isomorphic bundle. We will partition larger
+  // bundles so that each bundle is the same size.
+  unsigned MinSize =
+      std::min_element(Opcode2Operands.begin(), Opcode2Operands.end(),
+                       [](std::pair<unsigned, ValueList> &Entry1,
+                          std::pair<unsigned, ValueList> &Entry2) {
+                         return Entry1.second.size() < Entry2.second.size();
+                       })
+          ->second.size();
+
+  // Collect the final bundles while ensuring that each bundle contains MinSize
+  // values.
+  SmallVector<ValueList, 4> Bundles;
+  for (const std::pair<unsigned, ValueList> &Entry : Opcode2Operands) {
+    ArrayRef<Value *> Bundle(Entry.second);
+    while (Bundle.size() >= MinSize) {
+      ArrayRef<Value *> Slice = Bundle.take_front(MinSize);
+      Bundles.emplace_back(Slice.begin(), Slice.end());
+      Bundle = Bundle.drop_front(MinSize);
+    }
+  }
+
+  return Bundles;
+}
+
+/// Compute the cost of the shufflevector instructions needed to re-transpose
+/// binary operator bundles at run-time.
+///
+/// Unlike a normal binary operator bundle having two input bundles (one for
+/// each operand), a transposed binary operator bundle can have more than two
+/// (i.e., \p NumBundles) input bundles. We generate code for the transpose by
+/// concatenating the transposed input bundles together, and then shuffling out
+/// vectors corresponding to the operand-zero and operand-one bundles of the
+/// binary operator. So we have two kinds of shuffles to account for. We use
+/// the SK_InsertSubvector ShuffleKind for concatenating vectors, and the
+/// SK_Transpose ShuffleKind for the transpose.
+static unsigned getTransposedBinOpShuffleCost(TargetTransformInfo *TTI,
+                                              unsigned BundleVF,
+                                              unsigned NumBundles,
+                                              VectorType *VecTy) {
+  assert(isPowerOf2_32(NumBundles) && "NumBundles is not a power of 2");
+
+  // Compute the cost of concatenating the NumBundles vectors together, where
+  // each vector initially contains BundleVF elements. The result of the
+  // concatenation is two vectors with type VecTy that will be used as inputs
+  // to the shufflevector instructions performing the actual transpose. We
+  // assume the concatenation will be performed in ~log2(NumBundles) steps.
+  unsigned Cost = 0;
+  for (unsigned NumDstVecs = NumBundles / 2, ElemsPerSrcVec = BundleVF;
+       NumDstVecs >= 2; NumDstVecs /= 2, ElemsPerSrcVec *= 2) {
+    Type *SrcTy = VectorType::get(VecTy->getElementType(), ElemsPerSrcVec);
+    Type *DstTy = VectorType::get(VecTy->getElementType(), 2 * ElemsPerSrcVec);
+    Cost += NumDstVecs *
+            TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector, DstTy,
+                                ElemsPerSrcVec, SrcTy);
+  }
+
+  // Compute the cost of the transpose. The transpose requires two
+  // shufflevector instructions: one to compute the operand-zero bundle and
+  // another to compute the operand-one bundle.
+  Cost += 2 * TTI->getShuffleCost(TargetTransformInfo::SK_Transpose, VecTy);
+
+  return Cost;
+}
+
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                         ArrayRef<Value *> UserIgnoreLst) {
   ExtraValueToDebugLocsMap ExternallyUsedValues;
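As a standalone illustration of the grouping step performed by transposeBinOpBundle (simplified to plain strings rather than llvm::Instruction, and not part of the patch), the sketch below partitions the operands of the second example in the comment above row-wise by opcode, yielding one 'add' bundle and one 'sub' bundle of two values each. In the real code, groups larger than MinSize would additionally be split into MinSize-sized bundles.

    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    // One scalar operand of a root binary operator, tagged with its opcode.
    struct Operand {
      std::string Opcode;
      std::string Name;
    };

    int main() {
      // Operands of the two roots: add (add s0, s1) (add s2, s3) and
      //                            add (sub s4, s5) (sub s6, s7).
      std::vector<Operand> Operands = {{"add", "add s0 s1"}, {"add", "add s2 s3"},
                                       {"sub", "sub s4 s5"}, {"sub", "sub s6 s7"}};

      // Group row-wise by opcode (the patch uses a MapVector keyed on the
      // integer opcode, which also preserves insertion order).
      std::map<std::string, std::vector<std::string>> Bundles;
      for (const Operand &Op : Operands)
        Bundles[Op.Opcode].push_back(Op.Name);

      // Prints: add: [add s0 s1] [add s2 s3]
      //         sub: [sub s4 s5] [sub s6 s7]
      for (const auto &Bundle : Bundles) {
        std::printf("%s:", Bundle.first.c_str());
        for (const std::string &Name : Bundle.second)
          std::printf(" [%s]", Name.c_str());
        std::printf("\n");
      }
      return 0;
    }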
@@ -1794,9 +1973,27 @@
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor:
-      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
       DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
 
+      // If the operand bundles of this binary operator are not vectorizable,
+      // determine if we can transpose them, and if so, continue building the
+      // tree with the transposed bundles. That there is an instruction bundle
+      // for which we can compute a number of transposable operand bundles
+      // implies that the two operands of each instruction in the bundle have
+      // the same opcode. This is mutually exclusive with swapping commutative
+      // operands, which we attempt to do next.
+      if (isa<BinaryOperator>(VL0))
+        if (Optional<SmallVector<ValueList, 4>> TransposedBundles =
+                transposeBinOpBundle(VL)) {
+          newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies,
+                       /*ReorderIndices=*/None, *TransposedBundles);
+          for (const auto &Bundle : *TransposedBundles)
+            buildTree_rec(Bundle, Depth + 1, UserTreeIdx);
+          return;
+        }
+
+      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+
       // Sort operands of the instructions so that each side is more likely to
       // have the same opcode.
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
@@ -2293,33 +2490,46 @@
       TargetTransformInfo::OperandValueProperties Op2VP =
           TargetTransformInfo::OP_None;
 
-      // If all operands are exactly the same ConstantInt then set the
-      // operand kind to OK_UniformConstantValue.
-      // If instead not all operands are constants, then set the operand kind
-      // to OK_AnyValue. If all operands are constants but not the same,
-      // then set the operand kind to OK_NonUniformConstantValue.
-      ConstantInt *CInt = nullptr;
-      for (unsigned i = 0; i < VL.size(); ++i) {
-        const Instruction *I = cast<Instruction>(VL[i]);
-        if (!isa<ConstantInt>(I->getOperand(1))) {
-          Op2VK = TargetTransformInfo::OK_AnyValue;
-          break;
-        }
-        if (i == 0) {
-          CInt = cast<ConstantInt>(I->getOperand(1));
-          continue;
-        }
-        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
-            CInt != cast<ConstantInt>(I->getOperand(1)))
-          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
-      }
-      // FIXME: Currently cost of model modification for division by power of
-      // 2 is handled for X86 and AArch64. Add support for other targets.
-      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
-          CInt->getValue().isPowerOf2())
-        Op2VP = TargetTransformInfo::OP_PowerOf2;
+      unsigned TransposeShuffleCost = 0;
+      SmallVector<const Value *, 4> Operands;
+
+      if (!E->TransposedOperands.empty()) {
+        // If this binary operator's operand bundles can be transposed, we need
+        // to account for the shufflevector instructions that will perform the
+        // transpose operation.
+        TransposeShuffleCost =
+            getTransposedBinOpShuffleCost(TTI, E->TransposedOperands[0].size(),
+                                          E->TransposedOperands.size(), VecTy);
+      } else {
+        // If all operands are exactly the same ConstantInt then set the
+        // operand kind to OK_UniformConstantValue.
+        // If instead not all operands are constants, then set the operand kind
+        // to OK_AnyValue. If all operands are constants but not the same,
+        // then set the operand kind to OK_NonUniformConstantValue.
+        ConstantInt *CInt = nullptr;
+        for (unsigned i = 0; i < VL.size(); ++i) {
+          const Instruction *I = cast<Instruction>(VL[i]);
+          if (!isa<ConstantInt>(I->getOperand(1))) {
+            Op2VK = TargetTransformInfo::OK_AnyValue;
+            break;
+          }
+          if (i == 0) {
+            CInt = cast<ConstantInt>(I->getOperand(1));
+            continue;
+          }
+          if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+              CInt != cast<ConstantInt>(I->getOperand(1)))
+            Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+        }
+        // FIXME: Currently cost of model modification for division by power of
+        // 2 is handled for X86 and AArch64. Add support for other targets.
+        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
+            CInt->getValue().isPowerOf2())
+          Op2VP = TargetTransformInfo::OP_PowerOf2;
+
+        Operands.append(VL0->value_op_begin(), VL0->value_op_end());
+      }
 
-      SmallVector<const Value *, 4> Operands(VL0->operand_values());
       if (NeedToShuffleReuses) {
         ReuseShuffleCost -=
             (ReuseShuffleNumbers - VL.size()) *
@@ -2332,7 +2542,7 @@
                                                Op2VP, Operands);
       int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
                                                 Op1VP, Op2VP, Operands);
-      return ReuseShuffleCost + VecCost - ScalarCost;
+      return ReuseShuffleCost + TransposeShuffleCost + VecCost - ScalarCost;
     }
     case Instruction::GetElementPtr: {
       TargetTransformInfo::OperandValueKind Op1VK =
@@ -3334,21 +3544,43 @@
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor: {
-      ValueList LHSVL, RHSVL;
-      if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
-        reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
-                                       RHSVL);
-      else
-        for (Value *V : E->Scalars) {
-          auto *I = cast<Instruction>(V);
-          LHSVL.push_back(I->getOperand(0));
-          RHSVL.push_back(I->getOperand(1));
-        }
-
       setInsertPointAfterBundle(E->Scalars, VL0);
 
-      Value *LHS = vectorizeTree(LHSVL);
-      Value *RHS = vectorizeTree(RHSVL);
+      Value *LHS = nullptr;
+      Value *RHS = nullptr;
+      if (!E->TransposedOperands.empty()) {
+        // If this binary operator's operand bundles can be transposed,
+        // generate the shufflevector instructions that will perform the
+        // transpose operation.
+        //
+        // First, vectorize the transposed bundles and concatenate them
+        // together to form a single vector.
+        ValueList VectorizedValues;
+        for (const ValueList &Bundle : E->TransposedOperands)
+          VectorizedValues.push_back(vectorizeTree(Bundle));
+        Value *Merged = concatenateVectors(Builder, VectorizedValues);
+
+        // Next, perform the actual transpose by shuffling out the operand-zero
+        // and operand-one vectors using even and odd transpose masks.
+        Value *Undef = UndefValue::get(Merged->getType());
+        unsigned VF =
+            cast<VectorType>(Merged->getType())->getNumElements() / 2;
+        Constant *LHSMask = createTransposeMask(Builder, VF, /*IsOdd=*/false);
+        Constant *RHSMask = createTransposeMask(Builder, VF, /*IsOdd=*/true);
+        LHS = Builder.CreateShuffleVector(Merged, Undef, LHSMask);
+        RHS = Builder.CreateShuffleVector(Merged, Undef, RHSMask);
+      } else {
+        ValueList LHSVL, RHSVL;
+        if (VL0->isCommutative()) {
+          reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL, RHSVL);
+        } else {
+          for (Value *V : E->Scalars) {
+            auto *I = cast<Instruction>(V);
+            LHSVL.push_back(I->getOperand(0));
+            RHSVL.push_back(I->getOperand(1));
+          }
+        }
+        LHS = vectorizeTree(LHSVL);
+        RHS = vectorizeTree(RHSVL);
+      }
 
       if (E->VectorizedValue) {
         DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
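To see what the new TransposeShuffleCost term amounts to in a simple case, consider (hypothetically) a root bundle of four i32 scalars, i.e. VecTy = <4 x i32>, fed by four transposed bundles of two elements each (NumBundles = 4, BundleVF = 2). The concatenation loop runs once and queries SK_InsertSubvector twice for inserting a <2 x i32> subvector into a <4 x i32>, and the transpose adds two SK_Transpose shuffles of <4 x i32>. The standalone sketch below (not part of the patch; shuffleCost and transposedBinOpShuffleCost are illustrative names) mirrors that accounting with every shuffle costed at 1, so it prints 4.

    #include <cstdio>

    // Stand-in for TTI->getShuffleCost(); assume every shuffle costs 1.
    static unsigned shuffleCost() { return 1; }

    // Mirrors the accounting in getTransposedBinOpShuffleCost for NumBundles
    // input bundles (illustrative only; the real code also grows the source
    // and destination vector types as the concatenation proceeds).
    static unsigned transposedBinOpShuffleCost(unsigned NumBundles) {
      unsigned Cost = 0;
      // Concatenate pairs of vectors until two full-width vectors remain.
      for (unsigned NumDstVecs = NumBundles / 2; NumDstVecs >= 2; NumDstVecs /= 2)
        Cost += NumDstVecs * shuffleCost(); // SK_InsertSubvector concatenations
      // Two shuffles re-transpose the result into the operand-zero and
      // operand-one vectors.
      Cost += 2 * shuffleCost();            // SK_Transpose shuffles
      return Cost;
    }

    int main() {
      std::printf("%u\n", transposedBinOpShuffleCost(4)); // prints 4
      return 0;
    }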
Index: test/Transforms/SLPVectorizer/AArch64/transpose.ll
===================================================================
--- test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -5,19 +5,12 @@
 
 define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 ; CHECK-LABEL: @build_vec_v2i64(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i64> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i64> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i64> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i64> %v1, i32 1
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2_0]], i32 0
-; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <2 x i64> [[TMP3_0]], i64 [[TMP2_1]], i32 1
-; CHECK-NEXT:    ret <2 x i64> [[TMP3_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> %v0, %v1
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> %v0, %v1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
 ;
   %v0.0 = extractelement <2 x i64> %v0, i32 0
   %v0.1 = extractelement <2 x i64> %v0, i32 1
@@ -36,21 +29,17 @@
 
 define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
 ; CHECK-LABEL: @store_chain_v2i64(
-; CHECK-NEXT:    [[A_1:%.*]] = getelementptr i64, i64* %a, i64 1
-; CHECK-NEXT:    [[B_1:%.*]] = getelementptr i64, i64* %b, i64 1
-; CHECK-NEXT:    [[C_1:%.*]] = getelementptr i64, i64* %c, i64 1
-; CHECK-NEXT:    [[V0_0:%.*]] = load i64, i64* %a, align 8
-; CHECK-NEXT:    [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8
-; CHECK-NEXT:    [[V1_0:%.*]] = load i64, i64* %b, align 8
-; CHECK-NEXT:    [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    store i64 [[TMP2_0]], i64* %c, align 8
-; CHECK-NEXT:    store i64 [[TMP2_1]], i64* [[C_1]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* %a to <2 x i64>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* %b to <2 x i64>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* %c to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a.0 = getelementptr i64, i64* %a, i64 0
@@ -76,31 +65,12 @@
 
 define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <4 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <4 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V0_2:%.*]] = extractelement <4 x i32> %v0, i32 2
-; CHECK-NEXT:    [[V0_3:%.*]] = extractelement <4 x i32> %v0, i32 3
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <4 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <4 x i32> %v1, i32 1
-; CHECK-NEXT:    [[V1_2:%.*]] = extractelement <4 x i32> %v1, i32 2
-; CHECK-NEXT:    [[V1_3:%.*]] = extractelement <4 x i32> %v1, i32 3
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP0_2:%.*]] = add i32 [[V0_2]], [[V1_2]]
-; CHECK-NEXT:    [[TMP0_3:%.*]] = add i32 [[V0_3]], [[V1_3]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[V0_2]], [[V1_2]]
-; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[V0_3]], [[V1_3]]
-; CHECK-NEXT:    [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    [[TMP2_2:%.*]] = add i32 [[TMP0_2]], [[TMP0_3]]
-; CHECK-NEXT:    [[TMP2_3:%.*]] = add i32 [[TMP1_2]], [[TMP1_3]]
-; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0
-; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1
-; CHECK-NEXT:    [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP2_2]], i32 2
-; CHECK-NEXT:    [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_3]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
   %v0.1 = extractelement <4 x i32> %v0, i32 1
@@ -131,20 +101,12 @@
 
 define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_0(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> %v1, i32 1
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0
-; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1
-; CHECK-NEXT:    [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP2_0]], i32 2
-; CHECK-NEXT:    [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_1]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -166,26 +128,14 @@
 
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> %v1, i32 1
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]]
-; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2
-; CHECK-NEXT:    [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[TMP2_3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> [[SHUFFLE1]], <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> [[SHUFFLE1]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1
@@ -208,26 +158,16 @@
 
 define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_3_binops(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> %v1, i32 1
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i32> %v0, %v1
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <2 x i32> %v0, %v1
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP1_0]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_1]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP1_1]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP3]], <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1
@@ -254,38 +194,18 @@
 
 define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @reduction_v4i32(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <4 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <4 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V0_2:%.*]] = extractelement <4 x i32> %v0, i32 2
-; CHECK-NEXT:    [[V0_3:%.*]] = extractelement <4 x i32> %v0, i32 3
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <4 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <4 x i32> %v1, i32 1
-; CHECK-NEXT:    [[V1_2:%.*]] = extractelement <4 x i32> %v1, i32 2
-; CHECK-NEXT:    [[V1_3:%.*]] = extractelement <4 x i32> %v1, i32 3
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP0_2:%.*]] = add i32 [[V0_2]], [[V1_2]]
-; CHECK-NEXT:    [[TMP0_3:%.*]] = add i32 [[V0_3]], [[V1_3]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[V0_2]], [[V1_2]]
-; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[V0_3]], [[V1_3]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP0_0]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP0_2]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP1_2]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_1]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP0_1]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0_3]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP1_3]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]],
-; CHECK-NEXT:    [[TMP11:%.*]] = and <4 x i32> [[TMP10]],
-; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw <4 x i32> [[TMP11]],
-; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]]
-; CHECK-NEXT:    [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP14]])
-; CHECK-NEXT:    ret i32 [[TMP15]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]],
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP6]],
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]],
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP10]])
+; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
   %v0.1 = extractelement <4 x i32> %v0, i32 1