Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -95,6 +95,7 @@
   bool foldExtractedCmps(Instruction &I);
   bool foldSingleElementStore(Instruction &I);
   bool scalarizeLoadExtract(Instruction &I);
+  bool foldShuffleOfBinops(Instruction &I);
 
   void replaceValue(Value &Old, Value &New) {
     Old.replaceAllUsesWith(&New);
@@ -1058,6 +1059,60 @@
   return true;
 }
 
+/// Try to convert "shuffle (binop), (binop)" with a shared binop operand into
+/// "binop (shuffle), (shuffle)".
+bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
+  auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
+  if (!VecTy)
+    return false;
+
+  BinaryOperator *B0, *B1;
+  ArrayRef<int> Mask;
+  if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)),
+                           m_Mask(Mask))) ||
+      B0->getOpcode() != B1->getOpcode() || B0->getType() != VecTy)
+    return false;
+
+  // Try to replace a binop with a shuffle if the shuffle is not costly.
+  // The new shuffle will choose from a single, common operand, so it may be
+  // cheaper than the existing two-operand shuffle.
+  SmallVector<int> UnaryMask = createUnaryMask(Mask, Mask.size());
+  Instruction::BinaryOps Opcode = B0->getOpcode();
+  InstructionCost BinopCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+  InstructionCost ShufCost = TTI.getShuffleCost(
+      TargetTransformInfo::SK_PermuteSingleSrc, VecTy, UnaryMask);
+  if (ShufCost > BinopCost)
+    return false;
+
+  // If we have something like "add X, Y" and "add Z, X", swap ops to match.
+  Value *X = B0->getOperand(0), *Y = B0->getOperand(1);
+  Value *Z = B1->getOperand(0), *W = B1->getOperand(1);
+  if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W)
+    std::swap(X, Y);
+
+  Value *Shuf0, *Shuf1;
+  if (X == Z) {
+    // shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W)
+    Shuf0 = Builder.CreateShuffleVector(X, UnaryMask);
+    Shuf1 = Builder.CreateShuffleVector(Y, W, Mask);
+  } else if (Y == W) {
+    // shuf (bo X, Y), (bo Z, Y) --> bo (shuf X, Z), (shuf Y)
+    Shuf0 = Builder.CreateShuffleVector(X, Z, Mask);
+    Shuf1 = Builder.CreateShuffleVector(Y, UnaryMask);
+  } else {
+    return false;
+  }
+
+  Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
+  // Intersect flags from the old binops.
+  if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
+    NewInst->copyIRFlags(B0);
+    NewInst->andIRFlags(B1);
+  }
+  replaceValue(I, *NewBO);
+  return true;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 bool VectorCombine::run() {
@@ -1078,6 +1133,7 @@
     MadeChange |= foldExtractedCmps(I);
     MadeChange |= scalarizeLoadExtract(I);
     MadeChange |= foldSingleElementStore(I);
+    MadeChange |= foldShuffleOfBinops(I);
   };
   for (BasicBlock &BB : F) {
     // Ignore unreachable basic blocks.
Index: llvm/test/Transforms/VectorCombine/X86/shuffle.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/X86/shuffle.ll
+++ llvm/test/Transforms/VectorCombine/X86/shuffle.ll
@@ -151,11 +151,13 @@
   ret <2 x i64> %bc3
 }
 
+; Shuffle is much cheaper than fdiv. FMF are intersected.
+
 define <4 x float> @shuf_fdiv_v4f32_yy(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
 ; CHECK-LABEL: @shuf_fdiv_v4f32_yy(
-; CHECK-NEXT:    [[B0:%.*]] = fdiv fast <4 x float> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[B1:%.*]] = fdiv arcp <4 x float> [[Z:%.*]], [[Y]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> [[Z:%.*]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = fdiv arcp <4 x float> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %b0 = fdiv fast <4 x float> %x, %y
@@ -164,11 +166,13 @@
   ret <4 x float> %r
 }
 
+; Common operand is op0 of the binops.
+
 define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: @shuf_add_v4i32_xx(
-; CHECK-NEXT:    [[B0:%.*]] = add <4 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[B1:%.*]] = add <4 x i32> [[X]], [[Z:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> <i32 undef, i32 undef, i32 6, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 undef, i32 undef, i32 2, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> <i32 undef, i32 undef, i32 6, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %b0 = add <4 x i32> %x, %y
@@ -177,11 +181,13 @@
   ret <4 x i32> %r
 }
 
+; For commutative instructions, common operand may be swapped.
+
 define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
 ; CHECK-LABEL: @shuf_fmul_v4f32_xx_swap(
-; CHECK-NEXT:    [[B0:%.*]] = fmul <4 x float> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[B1:%.*]] = fmul <4 x float> [[Z:%.*]], [[X]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %b0 = fmul <4 x float> %x, %y
@@ -190,11 +196,13 @@
   ret <4 x float> %r
 }
 
+; For commutative instructions, common operand may be swapped.
+
 define <2 x i64> @shuf_and_v2i64_yy_swap(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) {
 ; CHECK-LABEL: @shuf_and_v2i64_yy_swap(
-; CHECK-NEXT:    [[B0:%.*]] = and <2 x i64> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[B1:%.*]] = and <2 x i64> [[Y]], [[Z:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i64> [[B0]], <2 x i64> [[B1]], <2 x i32> <i32 3, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[Y:%.*]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[X:%.*]], <2 x i64> [[Z:%.*]], <2 x i32> <i32 3, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
 ;
   %b0 = and <2 x i64> %x, %y
@@ -203,11 +211,13 @@
   ret <2 x i64> %r
 }
 
+; non-commutative binop, but common op0
+
 define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: @shuf_shl_v4i32_xx(
-; CHECK-NEXT:    [[B0:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[B1:%.*]] = shl <4 x i32> [[X]], [[Z:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> <i32 3, i32 1, i32 1, i32 6>
+; CHECK-NEXT:    [[R:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %b0 = shl <4 x i32> %x, %y
@@ -216,6 +226,8 @@
   ret <4 x i32> %r
 }
 
+; negative test - common operand, but not commutable
+
 define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: @shuf_shl_v4i32_xx_swap(
 ; CHECK-NEXT:    [[B0:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]]
@@ -229,6 +241,8 @@
   ret <4 x i32> %r
 }
 
+; negative test - mismatched opcodes
+
 define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) {
 ; CHECK-LABEL: @shuf_sub_add_v2i64_yy(
 ; CHECK-NEXT:    [[B0:%.*]] = sub <2 x i64> [[X:%.*]], [[Y:%.*]]
@@ -242,6 +256,8 @@
   ret <2 x i64> %r
 }
 
+; negative test - type change via shuffle
+
 define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
 ; CHECK-LABEL: @shuf_fmul_v4f32_xx_type(
 ; CHECK-NEXT:    [[B0:%.*]] = fmul <4 x float> [[X:%.*]], [[Y:%.*]]
@@ -255,6 +271,8 @@
   ret <8 x float> %r
 }
 
+; negative test - uses
+
 define <4 x i32> @shuf_lshr_v4i32_yy_use1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: @shuf_lshr_v4i32_yy_use1(
 ; CHECK-NEXT:    [[B0:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]]
@@ -270,6 +288,8 @@
   ret <4 x i32> %r
 }
 
+; negative test - uses
+
 define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: @shuf_mul_v4i32_yy_use2(
 ; CHECK-NEXT:    [[B0:%.*]] = mul <4 x i32> [[X:%.*]], [[Y:%.*]]
@@ -285,6 +305,8 @@
   ret <4 x i32> %r
 }
 
+; negative test - must have matching operand
+
 define <4 x float> @shuf_fadd_v4f32_no_common_op(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) {
 ; CHECK-LABEL: @shuf_fadd_v4f32_no_common_op(
 ; CHECK-NEXT:    [[B0:%.*]] = fadd <4 x float> [[X:%.*]], [[Y:%.*]]
@@ -298,6 +320,8 @@
   ret <4 x float> %r
 }
 
+; negative test - binops may be relatively cheap
+
 define <16 x i16> @shuf_and_v16i16_yy_expensive_shuf(<16 x i16> %x, <16 x i16> %y, <16 x i16> %z) {
 ; CHECK-LABEL: @shuf_and_v16i16_yy_expensive_shuf(
 ; CHECK-NEXT:    [[B0:%.*]] = and <16 x i16> [[X:%.*]], [[Y:%.*]]