diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1680,6 +1680,28 @@
       return IsSame(Scalars, ReuseShuffleIndices);
     }
 
+    /// \returns true if current entry has same operands as \p TE.
+    bool hasEqualOperands(const TreeEntry &TE) const {
+      if (TE.getNumOperands() != getNumOperands())
+        return false;
+      SmallBitVector Used(getNumOperands());
+      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
+        unsigned PrevCount = Used.count();
+        for (unsigned K = 0; K < E; ++K) {
+          if (Used.test(K))
+            continue;
+          if (getOperand(K) == TE.getOperand(I)) {
+            Used.set(K);
+            break;
+          }
+        }
+        // Check if we actually found the matching operand.
+        if (PrevCount == Used.count())
+          return false;
+      }
+      return true;
+    }
+
     /// \return Final vectorization factor for the node. Defined by the total
     /// number of vectorized scalars, including those, used several times in the
     /// entry and counted in the \a ReuseShuffleIndices, if any.
@@ -1773,6 +1795,12 @@
       return Operands[OpIdx];
     }
 
+    /// \returns the \p OpIdx operand of this TreeEntry.
+    ArrayRef<Value *> getOperand(unsigned OpIdx) const {
+      assert(OpIdx < Operands.size() && "Off bounds");
+      return Operands[OpIdx];
+    }
+
     /// \returns the number of operands.
     unsigned getNumOperands() const { return Operands.size(); }
 
@@ -2078,7 +2106,7 @@
   SmallPtrSet<const Value *, 32> EphValues;
 
   /// Holds all of the instructions that we gathered.
-  SetVector<Instruction *> GatherSeq;
+  SetVector<Instruction *> GatherShuffleSeq;
 
   /// A list of blocks that we are going to CSE.
   SetVector<BasicBlock *> CSEBlocks;
@@ -5016,7 +5044,30 @@
       // VecCost is equal to sum of the cost of creating 2 vectors
       // and the cost of creating shuffle.
       InstructionCost VecCost = 0;
-      if (Instruction::isBinaryOp(E->getOpcode())) {
+      // Try to find the previous shuffle node with the same operands and same
+      // main/alternate ops.
+      auto &&TryFindNodeWithEqualOperands = [this, E]() {
+        for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+          if (TE.get() == E)
+            break;
+          if (TE->isAltShuffle() &&
+              ((TE->getOpcode() == E->getOpcode() &&
+                TE->getAltOpcode() == E->getAltOpcode()) ||
+               (TE->getOpcode() == E->getAltOpcode() &&
+                TE->getAltOpcode() == E->getOpcode())) &&
+              TE->hasEqualOperands(*E))
+            return true;
+        }
+        return false;
+      };
+      if (TryFindNodeWithEqualOperands()) {
+        LLVM_DEBUG({
+          dbgs() << "SLP: diamond match for alternate node found.\n";
+          E->dump();
+        });
+        // No need to add new vector costs here since we're going to reuse
+        // same main/alternate vector ops, just do different shuffling.
+      } else if (Instruction::isBinaryOp(E->getOpcode())) {
         VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
         VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
                                                CostKind);
@@ -5728,7 +5779,7 @@
     auto *InsElt = dyn_cast<InsertElementInst>(Vec);
     if (!InsElt)
       return Vec;
-    GatherSeq.insert(InsElt);
+    GatherShuffleSeq.insert(InsElt);
     CSEBlocks.insert(InsElt->getParent());
     // Add to our 'need-to-extract' list.
     if (TreeEntry *Entry = getTreeEntry(V)) {
@@ -5915,7 +5966,7 @@
       ShuffleBuilder.addMask(ReuseShuffleIndicies);
       Vec = ShuffleBuilder.finalize(Vec);
       if (auto *I = dyn_cast<Instruction>(Vec)) {
-        GatherSeq.insert(I);
+        GatherShuffleSeq.insert(I);
         CSEBlocks.insert(I->getParent());
       }
     }
@@ -5953,7 +6004,7 @@
         ShuffleBuilder.addMask(E->ReuseShuffleIndices);
         Vec = ShuffleBuilder.finalize(Vec);
         if (auto *I = dyn_cast<Instruction>(Vec)) {
-          GatherSeq.insert(I);
+          GatherShuffleSeq.insert(I);
           CSEBlocks.insert(I->getParent());
         }
       }
@@ -6444,6 +6495,14 @@
         V1 = Builder.CreateCast(
             static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
       }
+      // Add V0 and V1 to later analysis to try to find and remove matching
+      // instruction, if any.
+      for (Value *V : {V0, V1}) {
+        if (auto *I = dyn_cast<Instruction>(V)) {
+          GatherShuffleSeq.insert(I);
+          CSEBlocks.insert(I->getParent());
+        }
+      }
 
       // Create shuffle to take alternate operations from the vector.
       // Also, gather up main and alt scalar ops to propagate IR flags to
@@ -6657,10 +6716,10 @@
 }
 
 void BoUpSLP::optimizeGatherSequence() {
-  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size()
                     << " gather sequences instructions.\n");
   // LICM InsertElementInst sequences.
-  for (Instruction *I : GatherSeq) {
+  for (Instruction *I : GatherShuffleSeq) {
     if (isDeleted(I))
       continue;
 
@@ -6719,7 +6778,7 @@
       if (isDeleted(&In))
         continue;
       if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) &&
-          !isa<ShuffleVectorInst>(&In))
+          !isa<ShuffleVectorInst>(&In) && !GatherShuffleSeq.contains(&In))
        continue;
 
      // Check if we can replace this instruction with any of the
@@ -6741,7 +6800,7 @@
     }
   }
   CSEBlocks.clear();
-  GatherSeq.clear();
+  GatherShuffleSeq.clear();
 }
 
 // Groups the instructions to a bundle (which is then a single scheduling entity)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -9,11 +9,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
 ;
   %v0.0 = extractelement <2 x i64> %v0, i32 0
   %v0.1 = extractelement <2 x i64> %v0, i32 1
@@ -39,12 +37,10 @@
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = add <2 x i64> [[TMP10]], [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a.0 = getelementptr i64, i64* %a, i64 0
@@ -73,11 +69,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
   %v0.1 = extractelement <4 x i32> %v0, i32 1
@@ -111,11 +105,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -224,17 +216,15 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]],
-; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i32> [[TMP8]],
-; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]],
-; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
-; CHECK-NEXT:    ret i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]],
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP6]],
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]],
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
+; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
   %v0.1 = extractelement <4 x i32> %v0, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -9,11 +9,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
 ;
   %v0.0 = extractelement <2 x i64> %v0, i32 0
   %v0.1 = extractelement <2 x i64> %v0, i32 1
@@ -39,12 +37,10 @@
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = add <2 x i64> [[TMP10]], [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a.0 = getelementptr i64, i64* %a, i64 0
@@ -73,11 +69,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
   %v0.1 = extractelement <4 x i32> %v0, i32 1
@@ -111,11 +105,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -224,17 +216,15 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]],
-; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i32> [[TMP8]],
-; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]],
-; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
-; CHECK-NEXT:    ret i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]],
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP6]],
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]],
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
+; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
   %v0.1 = extractelement <4 x i32> %v0, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_alternate.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_alternate.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_alternate.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_alternate.ll
@@ -8,7 +8,7 @@
 ; YAML-NEXT: Function:        build_vec_v2i64
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '-8'
+; YAML-NEXT:   - Cost:            '-10'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '8'
 
@@ -17,11 +17,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
 ;
   %v0.0 = extractelement <2 x i64> %v0, i32 0
   %v0.1 = extractelement <2 x i64> %v0, i32 1