Index: llvm/lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -404,6 +404,9 @@
     /// representation.
     QBFLT,
 
+    /// Custom extend v4f32 to v2f64.
+    FP_EXTEND_LHW,
+
     /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
     /// byte-swapping store instruction.  It byte-swaps the low "Type" bits of
     /// the GPRC input, then stores it through Ptr.  Type can be either i16 or
@@ -445,6 +448,10 @@
     /// an xxswapd.
     LXVD2X,
 
+    /// VSRC, CHAIN = LXVLHW CHAIN, Ptr - This is a floating-point load of a
+    /// v2f32 value into the lower half of a VSR register.
+    LXVLHW,
+
     /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
     /// Maps directly to an stxvd2x instruction that will be preceded by
     /// an xxswapd.
@@ -1018,6 +1025,7 @@
     SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -865,6 +865,7 @@
       setOperationAction(ISD::FPOWI, MVT::f128, Expand);
       setOperationAction(ISD::FREM, MVT::f128, Expand);
     }
+    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
   }
 
@@ -1365,6 +1366,8 @@
   case PPCISD::QVLFSb:          return "PPCISD::QVLFSb";
   case PPCISD::BUILD_FP128:     return "PPCISD::BUILD_FP128";
   case PPCISD::EXTSWSLI:        return "PPCISD::EXTSWSLI";
+  case PPCISD::LXVLHW:          return "PPCISD::LXVLHW";
+  case PPCISD::FP_EXTEND_LHW:   return "PPCISD::FP_EXTEND_LHW";
   }
   return nullptr;
 }
@@ -9512,6 +9515,59 @@
   return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT);
 }
 
+// Custom lowering for fpext v2f32 to v2f64.  Rewrites the fpext (and, where
+// profitable, a feeding binary FP op on loaded values) in terms of LXVLHW /
+// FP_EXTEND_LHW so the whole computation stays in vector registers.
+SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+
+  assert(Op.getOpcode() == ISD::FP_EXTEND &&
+         "Should only be called for ISD::FP_EXTEND");
+
+  // Only combine when the result is MVT::v2f64 and the operand is v2f32.
+  if (Op.getValueType() != MVT::v2f64 ||
+      Op.getOperand(0).getValueType() != MVT::v2f32)
+    return SDValue();
+
+  SDLoc dl(Op);
+  SDValue Op0 = Op.getOperand(0);
+
+  switch (Op0.getOpcode()) {
+  default:
+    return SDValue();
+  case ISD::FADD:
+  case ISD::FMUL:
+  case ISD::FSUB: {
+    SDValue NewLoad[2];
+    for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
+      // Ensure both inputs are loads.
+      SDValue LdOp = Op0.getOperand(i);
+      if (LdOp.getOpcode() != ISD::LOAD)
+        return SDValue();
+      // Generate a new load of the low half of a VSR.
+      LoadSDNode *LD = cast<LoadSDNode>(LdOp);
+      SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() };
+      NewLoad[i] =
+        DAG.getMemIntrinsicNode(PPCISD::LXVLHW, dl,
+                                DAG.getVTList(MVT::v4f32, MVT::Other),
+                                LoadOps, LD->getMemoryVT(),
+                                LD->getMemOperand());
+    }
+    // Re-create the binary op on v4f32 and extend only the low half.
+    SDValue NewOp = DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32,
+                                NewLoad[0], NewLoad[1],
+                                Op0.getNode()->getFlags());
+    return DAG.getNode(PPCISD::FP_EXTEND_LHW, dl, MVT::v2f64, NewOp);
+  }
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(Op0);
+    SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() };
+    SDValue NewLd =
+      DAG.getMemIntrinsicNode(PPCISD::LXVLHW, dl,
+                              DAG.getVTList(MVT::v4f32, MVT::Other),
+                              LoadOps, LD->getMemoryVT(), LD->getMemOperand());
+    return DAG.getNode(PPCISD::FP_EXTEND_LHW, dl, MVT::v2f64, NewLd);
+  }
+  }
+  llvm_unreachable("Should never reach here!");
+}
+
 /// LowerOperation - Provide custom lowering hooks for some operations.
/// SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -9565,6 +9621,7 @@ case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::ABS: return LowerABS(Op, DAG); + case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); // For counter-based loop handling. case ISD::INTRINSIC_W_CHAIN: return SDValue(); Index: llvm/lib/Target/PowerPC/PPCInstrVSX.td =================================================================== --- llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -53,6 +53,15 @@ def spilltovsrrc : RegisterOperand { let ParserMatchClass = PPCRegSPILLTOVSRRCAsmOperand; } + +def SDT_PPClxvlhw : SDTypeProfile<1, 1, [ + SDTCisVT<0, v4f32>, SDTCisPtrTy<1> +]>; + +def SDT_PPCfpextlhw : SDTypeProfile<1, 1, [ + SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32> +]>; + // Little-endian-specific nodes. def SDT_PPClxvd2x : SDTypeProfile<1, 1, [ SDTCisVT<0, v2f64>, SDTCisPtrTy<1> @@ -84,6 +93,10 @@ def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>; def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>; +def PPCfpextlhw : SDNode<"PPCISD::FP_EXTEND_LHW", SDT_PPCfpextlhw, []>; +def PPClxvlhw : SDNode<"PPCISD::LXVLHW", SDT_PPClxvlhw, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + multiclass XX3Form_Rcr opcode, bits<7> xo, string asmbase, string asmstr, InstrItinClass itin, Intrinsic Int, ValueType OutTy, ValueType InTy> { @@ -1062,6 +1075,8 @@ def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)), (v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>; +def : Pat<(v2f64 (PPCfpextlhw v4f32:$C)), (XVCVSPDP (XXMRGHW $C, $C))>; + // Loads. 
let Predicates = [HasVSX, HasOnlySwappingMemOps] in { def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>; @@ -3288,6 +3303,10 @@ def : Pat<(f32 (fpround (f64 (extloadf32 ixaddr:$src)))), (f32 (DFLOADf32 ixaddr:$src))>; + def : Pat<(v4f32 (PPClxvlhw xaddr:$src)), + (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VRRC)>; + def : Pat<(v4f32 (PPClxvlhw ixaddr:$src)), + (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VRRC)>; let AddedComplexity = 400 in { // The following pseudoinstructions are used to ensure the utilization Index: llvm/test/CodeGen/PowerPC/reduce_scalarization.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/PowerPC/reduce_scalarization.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s + +; Function Attrs: norecurse nounwind readonly +define dso_local <2 x double> @test1(<2 x float>* nocapture readonly %Ptr) { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lfd f0, 0(r3) +; CHECK-NEXT: xxmrghw vs0, vs0, vs0 +; CHECK-NEXT: xvcvspdp v2, vs0 +; CHECK-NEXT: blr +entry: + %0 = load <2 x float>, <2 x float>* %Ptr, align 8 + %1 = fpext <2 x float> %0 to <2 x double> + ret <2 x double> %1 +} + +; Function Attrs: norecurse nounwind readonly +define dso_local <2 x double> @test2(<2 x float>* nocapture readonly %a, <2 x float>* nocapture readonly %b) { +; CHECK-LABEL: test2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lfd f0, 0(r4) +; CHECK-NEXT: xxlor v2, vs0, vs0 +; CHECK-NEXT: lfd f0, 0(r3) +; CHECK-NEXT: xvsubsp vs0, vs0, v2 +; CHECK-NEXT: xxmrghw vs0, vs0, vs0 +; CHECK-NEXT: xvcvspdp v2, vs0 +; CHECK-NEXT: blr 
+entry:
+  %0 = load <2 x float>, <2 x float>* %a, align 8
+  %1 = load <2 x float>, <2 x float>* %b, align 8
+  %sub = fsub <2 x float> %0, %1
+  %2 = fpext <2 x float> %sub to <2 x double>
+  ret <2 x double> %2
+}
+
+; Function Attrs: norecurse nounwind readonly
+; Function Attrs: norecurse nounwind readonly
+define dso_local <2 x double> @test3(<2 x float>* nocapture readonly %a, <2 x float>* nocapture readonly %b) {
+; CHECK-LABEL: test3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lfd f0, 0(r4)
+; CHECK-NEXT:    xxlor v2, vs0, vs0
+; CHECK-NEXT:    lfd f0, 0(r3)
+; CHECK-NEXT:    xvaddsp vs0, vs0, v2
+; CHECK-NEXT:    xxmrghw vs0, vs0, vs0
+; CHECK-NEXT:    xvcvspdp v2, vs0
+; CHECK-NEXT:    blr
+entry:
+  %0 = load <2 x float>, <2 x float>* %a, align 8
+  %1 = load <2 x float>, <2 x float>* %b, align 8
+  %sub = fadd <2 x float> %0, %1
+  %2 = fpext <2 x float> %sub to <2 x double>
+  ret <2 x double> %2
+}
+
+; Function Attrs: norecurse nounwind readonly
+; Function Attrs: norecurse nounwind readonly
+define dso_local <2 x double> @test4(<2 x float>* nocapture readonly %a, <2 x float>* nocapture readonly %b) {
+; CHECK-LABEL: test4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lfd f0, 0(r4)
+; CHECK-NEXT:    xxlor v2, vs0, vs0
+; CHECK-NEXT:    lfd f0, 0(r3)
+; CHECK-NEXT:    xvmulsp vs0, vs0, v2
+; CHECK-NEXT:    xxmrghw vs0, vs0, vs0
+; CHECK-NEXT:    xvcvspdp v2, vs0
+; CHECK-NEXT:    blr
+entry:
+  %0 = load <2 x float>, <2 x float>* %a, align 8
+  %1 = load <2 x float>, <2 x float>* %b, align 8
+  %sub = fmul <2 x float> %0, %1
+  %2 = fpext <2 x float> %sub to <2 x double>
+  ret <2 x double> %2
+}