Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2441,6 +2441,13 @@
     return false;
   }
 
+  /// Try to convert a vector binary operation with splat operands into extract
+  /// vector element, a scalar operation, insert vector element, and splat.
+  virtual bool shouldScalarizeBinopSplat(SDValue VecOp,
+                                         unsigned SplatIdx) const {
+    return false;
+  }
+
   /// Try to convert math with an overflow comparison into the corresponding DAG
   /// node operation. Targets may want to override this independently of whether
   /// the operation is legal/custom for the given type because it may obscure
Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18739,6 +18739,29 @@
     }
   }
 
+  auto *ShufL = dyn_cast<ShuffleVectorSDNode>(LHS);
+  auto *ShufR = dyn_cast<ShuffleVectorSDNode>(RHS);
+  if (ShufL && ShufR && ShufL->isSplat() && ShufR->isSplat() &&
+      (ShufL->hasOneUse() || ShufR->hasOneUse())) {
+    int SplatIndex = ShufL->getSplatIndex();
+    if (SplatIndex == ShufR->getSplatIndex() &&
+        TLI.shouldScalarizeBinopSplat(SDValue(N, 0), SplatIndex)) {
+      // If insert/extract are cheap/free, we are left with only a scalar op
+      // followed by splat:
+      // bo (splat X), (splat Y) --> splat (ins (bo (ext X), (ext Y)), 0), 0
+      SDLoc DL(N);
+      EVT EltVT = VT.getScalarType();
+      SDValue C = DAG.getIntPtrConstant(SplatIndex, DL);
+      SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, LHS, C);
+      SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, RHS, C);
+      SDValue NewBO = DAG.getNode(Opcode, DL, EltVT, ExtL, ExtR, N->getFlags());
+      SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
+                                   DAG.getUNDEF(VT), NewBO,
+                                   DAG.getIntPtrConstant(0, DL));
+      SmallVector<int, 16> ZeroMask(VT.getVectorNumElements(), 0);
+      return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask);
+    }
+  }
   return SDValue();
 }
 
Index: llvm/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.h
+++ llvm/lib/Target/X86/X86ISelLowering.h
@@ -1068,6 +1068,13 @@
     /// supported.
     bool shouldScalarizeBinop(SDValue) const override;
 
+    /// Extract/insert of a scalar FP value from index 0 of a vector is free.
+    bool shouldScalarizeBinopSplat(SDValue VecOp,
+                                   unsigned SplatIndex) const override {
+      EVT EltVT = VecOp.getValueType().getScalarType();
+      return (EltVT == MVT::f32 || EltVT == MVT::f64) && !SplatIndex;
+    }
+
     /// Overflow nodes should get combined/lowered to optimal instructions
     /// (they should allow eliminating explicit compares by getting flags from
     /// math ops).
Index: llvm/test/CodeGen/X86/scalarize-fp.ll
===================================================================
--- llvm/test/CodeGen/X86/scalarize-fp.ll
+++ llvm/test/CodeGen/X86/scalarize-fp.ll
@@ -379,16 +379,14 @@
 define <2 x double> @fadd_splat_splat_v2f64(<2 x double> %vx, <2 x double> %vy) {
 ; SSE-LABEL: fadd_splat_splat_v2f64:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    addsd %xmm1, %xmm0
 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
-; SSE-NEXT:    addpd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: fadd_splat_splat_v2f64:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
-; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %splatx = shufflevector <2 x double> %vx, <2 x double> undef, <2 x i32> zeroinitializer
   %splaty = shufflevector <2 x double> %vy, <2 x double> undef, <2 x i32> zeroinitializer
@@ -396,22 +394,105 @@
   ret <2 x double> %r
 }
 
+; Negative test - splat of non-zero indexes (could still sink the splat).
+
+define <2 x double> @fadd_splat_splat_nonzero_v2f64(<2 x double> %vx, <2 x double> %vy) {
+; SSE-LABEL: fadd_splat_splat_nonzero_v2f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fadd_splat_splat_nonzero_v2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %splatx = shufflevector <2 x double> %vx, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+  %splaty = shufflevector <2 x double> %vy, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+  %r = fadd <2 x double> %splatx, %splaty
+  ret <2 x double> %r
+}
+
+; Negative test - splat of non-zero index and mismatched indexes.
+
+define <2 x double> @fadd_splat_splat_mismatch_v2f64(<2 x double> %vx, <2 x double> %vy) {
+; SSE-LABEL: fadd_splat_splat_mismatch_v2f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fadd_splat_splat_mismatch_v2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %splatx = shufflevector <2 x double> %vx, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %splaty = shufflevector <2 x double> %vy, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+  %r = fadd <2 x double> %splatx, %splaty
+  ret <2 x double> %r
+}
+
+; Negative test - non-splat.
+
+define <2 x double> @fadd_splat_nonsplat_v2f64(<2 x double> %vx, <2 x double> %vy) {
+; SSE-LABEL: fadd_splat_nonsplat_v2f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: fadd_splat_nonsplat_v2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %splatx = shufflevector <2 x double> %vx, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %splaty = shufflevector <2 x double> %vy, <2 x double> undef, <2 x i32> <i32 0, i32 1>
+  %r = fadd <2 x double> %splatx, %splaty
+  ret <2 x double> %r
+}
+
+; Negative test - non-FP.
+
+define <2 x i64> @add_splat_splat_v2i64(<2 x i64> %vx, <2 x i64> %vy) {
+; SSE-LABEL: add_splat_splat_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSE-NEXT:    paddq %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: add_splat_splat_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %splatx = shufflevector <2 x i64> %vx, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %splaty = shufflevector <2 x i64> %vy, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %r = add <2 x i64> %splatx, %splaty
+  ret <2 x i64> %r
+}
+
 define <4 x double> @fsub_splat_splat_v4f64(double %x, double %y) {
 ; SSE-LABEL: fsub_splat_splat_v4f64:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    subsd %xmm1, %xmm0
 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
-; SSE-NEXT:    subpd %xmm1, %xmm0
 ; SSE-NEXT:    movapd %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: fsub_splat_splat_v4f64:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %vx = insertelement <4 x double> undef, double %x, i32 0
   %vy = insertelement <4 x double> undef, double %y, i32 0
@@ -424,16 +505,14 @@
 define <4 x float> @fmul_splat_splat_v4f32(<4 x float> %vx, <4 x float> %vy) {
 ; SSE-LABEL: fmul_splat_splat_v4f32:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    mulss %xmm1, %xmm0
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT:    mulps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: fmul_splat_splat_v4f32:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %splatx = shufflevector <4 x float> %vx, <4 x float> undef, <4 x i32> zeroinitializer
   %splaty = shufflevector <4 x float> %vy, <4 x float> undef, <4 x i32> zeroinitializer
@@ -444,30 +523,16 @@
 define <8 x float> @fdiv_splat_splat_v8f32(<8 x float> %vx, <8 x float> %vy) {
 ; SSE-LABEL: fdiv_splat_splat_v8f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    rcpps %xmm2, %xmm3
-; SSE-NEXT:    mulps %xmm3, %xmm2
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT:    subps %xmm2, %xmm1
-; SSE-NEXT:    mulps %xmm3, %xmm1
-; SSE-NEXT:    addps %xmm3, %xmm1
-; SSE-NEXT:    mulps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    divss %xmm2, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT:    movaps %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: fdiv_splat_splat_v8f32:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX-NEXT:    vrcpps %ymm1, %ymm2
-; AVX-NEXT:    vmulps %ymm2, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT:    vsubps %ymm1, %ymm3, %ymm1
-; AVX-NEXT:    vmulps %ymm1, %ymm2, %ymm1
-; AVX-NEXT:    vaddps %ymm1, %ymm2, %ymm1
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %splatx = shufflevector <8 x float> %vx, <8 x float> undef, <8 x i32> zeroinitializer
   %splaty = shufflevector <8 x float> %vy, <8 x float> undef, <8 x i32> zeroinitializer