Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -326,6 +331,11 @@
   /// target.
   bool shouldBuildLookupTables() const;
 
+  /// \brief Return true if the target always benefits from combining into FMA
+  /// for a given value type. This must typically return false on targets where
+  /// FMA takes more cycles to execute than FADD.
+  bool enableAggressiveFMAFusion(Type *Ty) const;
+
   /// \brief Return hardware support for population count.
   PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
 
@@ -347,6 +352,7 @@
                           Type *Ty) const;
   unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                          Type *Ty) const;
+
   /// @}
 
   /// \name Vector Target Information
@@ -525,6 +531,7 @@
   virtual unsigned getJumpBufAlignment() = 0;
   virtual unsigned getJumpBufSize() = 0;
   virtual bool shouldBuildLookupTables() = 0;
+  virtual bool enableAggressiveFMAFusion(Type *Ty) = 0;
   virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
   virtual bool haveFastSqrt(Type *Ty) = 0;
   virtual unsigned getFPOpCost(Type *Ty) = 0;
@@ -639,6 +646,9 @@
   bool shouldBuildLookupTables() override {
     return Impl.shouldBuildLookupTables();
   }
+  bool enableAggressiveFMAFusion(Type *Ty) override {
+    return Impl.enableAggressiveFMAFusion(Ty);
+  }
   PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) override {
     return Impl.getPopcntSupport(IntTyWidthInBit);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -233,6 +233,8 @@
   bool shouldBuildLookupTables() { return true; }
 
+  bool enableAggressiveFMAFusion(Type *Ty) { return false; }
+
   TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) {
     return TTI::PSK_Software;
   }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -182,6 +182,10 @@
            TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
   }
 
+  bool enableAggressiveFMAFusion(Type *Ty) {
+    return getTLI()->enableAggressiveFMAFusion(Ty);
+  }
+
   bool haveFastSqrt(Type *Ty) {
     const TargetLoweringBase *TLI = getTLI();
     EVT VT = TLI->getValueType(Ty);
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -298,7 +298,7 @@
   /// Return true if target always beneficiates from combining into FMA for a
   /// given value type. This must typically return false on targets where FMA
   /// takes more cycles to execute than FADD.
-  virtual bool enableAggressiveFMAFusion(EVT VT) const {
+  virtual bool enableAggressiveFMAFusion(Type *Ty) const {
     return false;
   }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -139,6 +139,10 @@
   return TTIImpl->shouldBuildLookupTables();
 }
 
+bool TargetTransformInfo::enableAggressiveFMAFusion(Type *Ty) const {
+  return TTIImpl->enableAggressiveFMAFusion(Ty);
+}
+
 TargetTransformInfo::PopcntSupportKind
 TargetTransformInfo::getPopcntSupport(unsigned IntTyWidthInBit) const {
   return TTIImpl->getPopcntSupport(IntTyWidthInBit);
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6944,6 +6944,7 @@
   ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
   ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
   EVT VT = N->getValueType(0);
+  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
   const TargetOptions &Options = DAG.getTarget().Options;
 
   // fold vector ops
@@ -7084,14 +7085,14 @@
   // fold (fadd (fmul x, y), z) -> (fma x, y, z)
   if (N0.getOpcode() == ISD::FMUL &&
-      (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
+      (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(Ty)))
     return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                        N0.getOperand(0), N0.getOperand(1), N1);
 
   // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
   // Note: Commutes FADD operands.
   if (N1.getOpcode() == ISD::FMUL &&
-      (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
+      (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(Ty)))
     return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                        N1.getOperand(0), N1.getOperand(1), N0);
@@ -7124,7 +7125,7 @@
   }
 
   // More folding opportunities when target permits.
-  if (TLI.enableAggressiveFMAFusion(VT)) {
+  if (TLI.enableAggressiveFMAFusion(Ty)) {
 
     // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z))
     if (N0.getOpcode() == ISD::FMA &&
@@ -7157,6 +7158,7 @@
   ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
   ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
   EVT VT = N->getValueType(0);
+  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
   SDLoc dl(N);
   const TargetOptions &Options = DAG.getTarget().Options;
 
@@ -7214,7 +7216,7 @@
   // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
   if (N0.getOpcode() == ISD::FMUL &&
-      (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
+      (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(Ty)))
     return DAG.getNode(ISD::FMA, dl, VT,
                        N0.getOperand(0), N0.getOperand(1),
                        DAG.getNode(ISD::FNEG, dl, VT, N1));
@@ -7222,7 +7224,7 @@
   // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
   // Note: Commutes FSUB operands.
   if (N1.getOpcode() == ISD::FMUL &&
-      (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
+      (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(Ty)))
     return DAG.getNode(ISD::FMA, dl, VT,
                        DAG.getNode(ISD::FNEG, dl, VT,
                                    N1.getOperand(0)),
@@ -7232,7 +7234,7 @@
   if (N0.getOpcode() == ISD::FNEG &&
       N0.getOperand(0).getOpcode() == ISD::FMUL &&
       ((N0->hasOneUse() && N0.getOperand(0).hasOneUse()) ||
-       TLI.enableAggressiveFMAFusion(VT))) {
+       TLI.enableAggressiveFMAFusion(Ty))) {
     SDValue N00 = N0.getOperand(0).getOperand(0);
     SDValue N01 = N0.getOperand(0).getOperand(1);
     return DAG.getNode(ISD::FMA, dl, VT,
@@ -7310,7 +7312,7 @@
   }
 
   // More folding opportunities when target permits.
-  if (TLI.enableAggressiveFMAFusion(VT)) {
+  if (TLI.enableAggressiveFMAFusion(Ty)) {
 
     // fold (fsub (fma x, y, (fmul u, v)), z)
     //   -> (fma x, y (fma u, v, (fneg z)))
Index: lib/Target/NVPTX/NVPTXISelLowering.h
===================================================================
--- lib/Target/NVPTX/NVPTXISelLowering.h
+++ lib/Target/NVPTX/NVPTXISelLowering.h
@@ -508,7 +508,7 @@
   bool isFMAFasterThanFMulAndFAdd(EVT) const override { return true; }
 
-  bool enableAggressiveFMAFusion(EVT VT) const override { return true; }
+  bool enableAggressiveFMAFusion(Type *Ty) const override { return true; }
 
 private:
   const NVPTXSubtarget &STI; // cache the subtarget here
Index: lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.h
+++ lib/Target/PowerPC/PPCISelLowering.h
@@ -410,7 +410,7 @@
     /// Return true if target always beneficiates from combining into FMA for a
     /// given value type. This must typically return false on targets where FMA
     /// takes more cycles to execute than FADD.
-    bool enableAggressiveFMAFusion(EVT VT) const override;
+    bool enableAggressiveFMAFusion(Type *Ty) const override;
 
     /// getPreIndexedAddressParts - returns true by value, base pointer and
     /// offset pointer and addressing mode by reference if the node's address
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -860,8 +860,10 @@
   return VT.changeVectorElementTypeToInteger();
 }
 
-bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
-  assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
+bool PPCTargetLowering::enableAggressiveFMAFusion(Type *Ty) const {
+  assert((Ty->isFloatingPointTy() ||
+          (Ty->isVectorTy() && Ty->getVectorElementType()->isFloatingPointTy()))
+         && "Non-floating-point FMA?");
   return true;
 }
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4537,6 +4537,86 @@
     return SmallUF;
   }
 
+  // Unroll if this is a large loop (small loops are already dealt with by this
+  // point) and there is a scalar reduction that could benefit from unrolling.
+  if (!UnrollingRequiresRuntimePointerCheck &&
+      Legal->getReductionVars()->size() &&
+      TheLoop->getLoopLatch()) {
+    // Typically, an inner reduction loop might have been fully unrolled at
+    // this point, which makes the outer loop large. We want to interleave the
+    // reductions in the outer loop to expose ILP opportunities. However, we
+    // need to be careful since the loop is large and we want to avoid spills.
+
+    // The heuristic used here is to divide the unroll factor by the average
+    // distance between reductions. Indeed, greater distance means it is likely
+    // that some ILP opportunities are already exposed in the loop.
+
+    if (UF > 1) {
+      // Find critical path for integer and floating-point instructions.
+      unsigned ICriticalPathLength = 0, FPCriticalPathLength = 0;
+      for (auto Redx : *Legal->getReductionVars()) {
+        unsigned PathLength = 0;
+        User *U = Redx.getFirst();
+        while (U != Redx.getSecond().LoopExitInstr) {
+          PathLength++;
+          auto I = U->users().begin();
+          U = *I++;
+          assert((I.atEnd()) &&
+                 "Expected exactly one use of reduction variable.");
+        }
+        Type *Ty = U->getType();
+        if (Ty->isIntegerTy() && ICriticalPathLength < PathLength)
+          ICriticalPathLength = PathLength;
+        else if (Ty->isFloatingPointTy() && FPCriticalPathLength < PathLength)
+          FPCriticalPathLength = PathLength;
+      }
+
+      // Retrieve number of integer and floating-point instructions.
+      unsigned ILoopLength = 0, FPLoopLength = 0;
+      for (BasicBlock::iterator I : *TheLoop->getLoopLatch()) {
+        if (I->isBinaryOp()) {
+          if (I->getType()->isIntegerTy())
+            ILoopLength++;
+          else if (I->getType()->isFloatingPointTy())
+            FPLoopLength++;
+        }
+      }
+
+      // Measure average distance between reductions in the loop. Distance for
+      // integer reductions is multiplied by 2 as latency is generally lower
+      // than for floating-point reductions.
+      unsigned IDistance = 0, FPDistance = 0;
+      if (ICriticalPathLength)
+        IDistance = (ILoopLength / ICriticalPathLength) * 2;
+      if (FPCriticalPathLength)
+        FPDistance = FPLoopLength / FPCriticalPathLength;
+
+      // If the target supports aggressive FMA fusion, it is likely that the
+      // distance will be lower than it is now as some nodes will be combined.
+      LLVMContext &Context = TheLoop->getHeader()->getContext();
+      Type *FloatTy = Type::getFloatTy(Context);
+      if (TTI.enableAggressiveFMAFusion(FloatTy) && FPDistance > 1)
+        FPDistance--;
+
+      // We are interested in the minimum distance between reductions, so that
+      // we can hide the latency accordingly.
+      unsigned MinDistance = 0;
+      if (IDistance && FPDistance)
+        MinDistance = std::min(IDistance, FPDistance);
+      else if (IDistance)
+        MinDistance = IDistance;
+      else
+        MinDistance = FPDistance;
+
+      // Guard against a zero distance (e.g., when the latch block contains
+      // fewer binary ops than the reduction's critical path), which would
+      // otherwise divide by zero below.
+      if (!MinDistance)
+        MinDistance = 1;
+
+      // Reduce unroll factor to a reasonable number.
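+      // For instance (illustrative numbers only): with UF = 8, no integer
+      // reductions, and a floating-point reduction whose critical path is 3
+      // instructions in a latch containing 9 floating-point binary ops,
+      // FPDistance = 9 / 3 = 3 (2 on targets with aggressive FMA fusion), so
+      // the unroll factor becomes PowerOf2Floor(8 / 2) = 4.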
+      UF = PowerOf2Floor(UF / MinDistance);
+      if (!UF) ++UF;
+    }
+
+    DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
+    return UF;
+  }
+
   DEBUG(dbgs() << "LV: Not Unrolling.\n");
   return 1;
 }
Index: test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
===================================================================
--- test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
+++ test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
@@ -0,0 +1,68 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; CHECK: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NOT: fadd
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-ibm-linux-gnu"
+
+define void @QLA_F3_r_veq_norm2_V(float* noalias nocapture %r, [3 x { float, float }]* noalias nocapture readonly %a, i32 signext %n) #0 {
+entry:
+  %cmp24 = icmp sgt i32 %n, 0
+  br i1 %cmp24, label %for.cond1.preheader.lr.ph, label %for.end13
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %0 = add i32 %n, -1
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.lr.ph, %for.body3
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+  %sum.026 = phi double [ 0.000000e+00, %for.cond1.preheader.lr.ph ], [ %add10.2, %for.body3 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader
+  %arrayidx5.realp = getelementptr inbounds [3 x { float, float }]* %a, i64 %indvars.iv, i64 0, i32 0
+  %arrayidx5.real = load float* %arrayidx5.realp, align 8
+  %arrayidx5.imagp = getelementptr inbounds [3 x { float, float }]* %a, i64 %indvars.iv, i64 0, i32 1
+  %arrayidx5.imag = load float* %arrayidx5.imagp, align 8
+  %mul = fmul fast float %arrayidx5.real, %arrayidx5.real
+  %mul9 = fmul fast float %arrayidx5.imag, %arrayidx5.imag
+  %add = fadd fast float %mul9, %mul
+  %conv = fpext float %add to double
+  %add10 = fadd fast double %conv, %sum.026
+  %arrayidx5.realp.1 = getelementptr inbounds [3 x { float, float }]* %a, i64 %indvars.iv, i64 1, i32 0
+  %arrayidx5.real.1 = load float* %arrayidx5.realp.1, align 8
+  %arrayidx5.imagp.1 = getelementptr inbounds [3 x { float, float }]* %a, i64 %indvars.iv, i64 1, i32 1
+  %arrayidx5.imag.1 = load float* %arrayidx5.imagp.1, align 8
+  %mul.1 = fmul fast float %arrayidx5.real.1, %arrayidx5.real.1
+  %mul9.1 = fmul fast float %arrayidx5.imag.1, %arrayidx5.imag.1
+  %add.1 = fadd fast float %mul9.1, %mul.1
+  %conv.1 = fpext float %add.1 to double
+  %add10.1 = fadd fast double %conv.1, %add10
+  %arrayidx5.realp.2 = getelementptr inbounds [3 x { float, float }]* %a, i64 %indvars.iv, i64 2, i32 0
+  %arrayidx5.real.2 = load float* %arrayidx5.realp.2, align 8
+  %arrayidx5.imagp.2 = getelementptr inbounds [3 x { float, float }]* %a, i64 %indvars.iv, i64 2, i32 1
+  %arrayidx5.imag.2 = load float* %arrayidx5.imagp.2, align 8
+  %mul.2 = fmul fast float %arrayidx5.real.2, %arrayidx5.real.2
+  %mul9.2 = fmul fast float %arrayidx5.imag.2, %arrayidx5.imag.2
+  %add.2 = fadd fast float %mul9.2, %mul.2
+  %conv.2 = fpext float %add.2 to double
+  %add10.2 = fadd fast double %conv.2, %add10.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %0
+  br i1 %exitcond, label %for.cond1.preheader, label %for.cond.for.end13_crit_edge
+
+for.cond.for.end13_crit_edge:                     ; preds = %for.body3
+  %add10.lcssa.lcssa = phi double [ %add10.2, %for.body3 ]
+  %phitmp = fptrunc double %add10.lcssa.lcssa to float
+  br label %for.end13
+
+for.end13:                                        ; preds = %for.cond.for.end13_crit_edge, %entry
+  %sum.0.lcssa = phi float [ %phitmp, %for.cond.for.end13_crit_edge ], [ 0.000000e+00, %entry ]
+  store float %sum.0.lcssa, float* %r, align 4
+  ret void
+}
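
For reference, a minimal sketch (not part of the patch) of how a hypothetical
out-of-tree target, called FooTargetLowering here purely for illustration,
could opt into the new Type*-based hook; it simply mirrors the NVPTX and
PowerPC overrides above:

  // FooISelLowering.h (hypothetical target, illustration only)
  bool enableAggressiveFMAFusion(Type *Ty) const override {
    // Fuse whenever the scalar or vector element type is floating point;
    // the FMA combines are only queried with floating-point types anyway.
    return Ty->getScalarType()->isFloatingPointTy();
  }

IR-level passes reach the same hook through TargetTransformInfo, e.g.
TTI.enableAggressiveFMAFusion(Type::getFloatTy(Ctx)), which is how the
LoopVectorize change above consults it.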