Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -247,6 +247,12 @@
     cl::desc(
         "Enable runtime interleaving until load/store ports are saturated"));
 
+/// Interleave small loops with scalar reductions.
+static cl::opt<bool> InterleaveSmallLoopScalarReduction(
+    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
+    cl::desc("Enable interleaving for loops with small iteration counts that "
+             "contain scalar reductions to expose ILP."));
+
 /// The number of stores in a loop that are allowed to need predication.
 static cl::opt<unsigned> NumberOfStoresToPredicate(
     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
@@ -5408,10 +5414,15 @@
   if (Legal->getMaxSafeDepDistBytes() != -1U)
     return 1;
 
-  // Do not interleave loops with a relatively small known or estimated trip
-  // count.
   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
-  if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
+  const bool HasReductions = !Legal->getReductionVars().empty();
+  // Do not interleave loops with a relatively small known or estimated trip
+  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
+  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
+  // because with the above conditions interleaving can expose ILP and break
+  // cross iteration dependences for reductions.
+  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
+      !(InterleaveSmallLoopScalarReduction && HasReductions && VF == 1))
     return 1;
 
   RegisterUsage R = calculateRegisterUsage({VF})[0];
@@ -5496,7 +5507,7 @@
 
   // Interleave if we vectorized this loop and there is a reduction that could
   // benefit from interleaving.
-  if (VF > 1 && !Legal->getReductionVars().empty()) {
+  if (VF > 1 && HasReductions) {
     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
     return IC;
   }
@@ -5508,7 +5519,11 @@
 
   // We want to interleave small loops in order to reduce the loop overhead and
   // potentially expose ILP opportunities.
-  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
+                    << "LV: IC is " << IC << '\n'
+                    << "LV: VF is " << VF << '\n');
+  const bool AggressivelyInterleaveReductions =
+      TTI.enableAggressiveInterleaving(HasReductions);
   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
     // We assume that the cost overhead is 1 and we use the cost model
     // to estimate the cost of the loop and interleave until the cost of the
@@ -5527,7 +5542,7 @@
     // by this point), we can increase the critical path length if the loop
     // we're interleaving is inside another loop. Limit, by default to 2, so the
     // critical path only gets increased by one reduction operation.
-    if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
+    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
       SmallIC = std::min(SmallIC, F);
       StoresIC = std::min(StoresIC, F);
@@ -5541,14 +5556,23 @@
       return std::max(StoresIC, LoadsIC);
     }
 
-    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
-    return SmallIC;
+    // If there are scalar reductions and TTI has enabled aggressive
+    // interleaving for reductions, we will interleave to expose ILP.
+    if (InterleaveSmallLoopScalarReduction && VF == 1 &&
+        AggressivelyInterleaveReductions) {
+      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+      // Interleave no less than SmallIC but not as aggressive as the normal IC
+      // to satisfy the rare situation when resources are too limited.
+      return std::max(IC / 2, SmallIC);
+    } else {
+      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+      return SmallIC;
+    }
   }
 
   // Interleave if this is a large loop (small loops are already dealt with by
   // this point) that could benefit from interleaving.
-  bool HasReductions = !Legal->getReductionVars().empty();
-  if (TTI.enableAggressiveInterleaving(HasReductions)) {
+  if (AggressivelyInterleaveReductions) {
     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
     return IC;
   }
Index: llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
@@ -0,0 +1,94 @@
+; RUN: opt < %s -loop-vectorize -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='loop-vectorize' -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+
+;void fun(Vector &MatrixB,
+;         const Vector &MatrixA,
+;         const unsigned int * const start,
+;         const unsigned int * const end,
+;         const double * val) const
+;{
+;  const unsigned int N = MatrixB.size();
+;  MatrixB = MatrixA;
+;  for (unsigned int row = 0; row < N; ++row)
+
+%3 = type { %7*, i8* }
+%4 = type { %5 }
+%5 = type { %6 }
+%6 = type { i32**, i32**, i32** }
+%7 = type <{ %8, i32, i32, i32, [4 x i8], i64, i32, [4 x i8], i64*, i32*, i8, i8, [6 x i8] }>
+%8 = type { i32 (...)**, i32, %9, %16* }
+%9 = type { %10 }
+%10 = type { %11 }
+%11 = type { %12, %14 }
+%12 = type { %13 }
+%13 = type { i8 }
+%14 = type { %15, i64 }
+%15 = type { i32, %15*, %15*, %15* }
+%16 = type { i32 (...)**, i8* }
+%17 = type { %8, i32, i32, double* }
+
+$test = comdat any
+
+define dso_local void @test(%0* %arg, %17* dereferenceable(88) %arg1) comdat align 2 {
+bb:
+  %tpm14 = getelementptr %0, %0* %arg, i64 0, i32 0, i32 3, i32 0, i32 0, i32 0
+  %tpm15 = load i32**, i32*** %tpm14, align 8
+  %tpm18 = getelementptr inbounds %17, %17* %arg1, i64 0, i32 3
+  %tpm19 = load double*, double** %tpm18, align 8
+  br label %bb22
+bb22:                                             ; preds = %bb33, %bb
+  %tpm26 = add i64 0, 1
+  %tpm27 = getelementptr inbounds i32, i32* null, i64 %tpm26
+  %tpm28 = getelementptr inbounds i32*, i32** %tpm15, i64 undef
+  %tpm29 = load i32*, i32** %tpm28, align 8
+  %tpm32 = getelementptr inbounds double, double* null, i64 %tpm26
+  br label %bb40
+bb33:                                             ; preds = %bb40
+  %tpm35 = getelementptr inbounds double, double* %tpm19, i64 undef
+  %tpm37 = fsub fast double 0.000000e+00, %tpm50
+  store double %tpm37, double* %tpm35, align 8
+  br label %bb22
+bb40:                                             ; preds = %bb40, %bb22
+  %tpm41 = phi i32* [ %tpm51, %bb40 ], [ %tpm27, %bb22 ]
+  %tpm42 = phi double* [ %tpm52, %bb40 ], [ %tpm32, %bb22 ]
+  %tpm43 = phi double [ %tpm50, %bb40 ], [ 0.000000e+00, %bb22 ]
+  %tpm44 = load double, double* %tpm42, align 8
+  %tpm45 = load i32, i32* %tpm41, align 4
+  %tpm46 = zext i32 %tpm45 to i64
+  %tpm47 = getelementptr inbounds double, double* %tpm19, i64 %tpm46
+  %tpm48 = load double, double* %tpm47, align 8
+  %tpm49 = fmul fast double %tpm48, %tpm44
+  %tpm50 = fadd fast double %tpm49, %tpm43
+  %tpm51 = getelementptr inbounds i32, i32* %tpm41, i64 1
+  %tpm52 = getelementptr inbounds double, double* %tpm42, i64 1
+  %tpm53 = icmp eq i32* %tpm51, %tpm29
+  br i1 %tpm53, label %bb33, label %bb40
+}
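
For context, and not part of the patch: a minimal C++ sketch of the kind of loop the new -interleave-small-loop-scalar-reduction heuristic targets, and of the shape that interleaving at VF == 1 with IC == 2 roughly corresponds to. The function names dot and dot_interleaved are hypothetical; reassociating the floating-point reduction this way is only legal under fast-math, matching the fadd fast / fmul fast in the test above.

// Short-trip-count loop whose only loop-carried dependence is the scalar
// reduction into 'sum'. With a small trip count the vectorizer would normally
// return an interleave count of 1 for it.
double dot(const double *a, const double *b, unsigned n) {
  double sum = 0.0;
  for (unsigned i = 0; i < n; ++i)
    sum += a[i] * b[i];
  return sum;
}

// Roughly what interleaving the scalar loop by 2 exposes: two independent
// partial sums, i.e. two fadd dependence chains that can execute in parallel,
// plus a scalar remainder loop.
double dot_interleaved(const double *a, const double *b, unsigned n) {
  double sum0 = 0.0, sum1 = 0.0;
  unsigned i = 0;
  for (; i + 1 < n; i += 2) {
    sum0 += a[i] * b[i];
    sum1 += a[i + 1] * b[i + 1];
  }
  for (; i < n; ++i)
    sum0 += a[i] * b[i];
  return sum0 + sum1;
}

Breaking the single accumulator into independent chains is the ILP the heuristic is after; without it, every fadd waits on the previous iteration's result.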