Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -247,6 +247,12 @@
     cl::desc(
         "Enable runtime interleaving until load/store ports are saturated"));
 
+/// Interleave small loops with scalar reductions.
+static cl::opt<bool> InterleaveSmallLoopScalarReduction(
+    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
+    cl::desc("Enable interleaving for small loops with scalar reductions "
+             "to expose ILP."));
+
 /// The number of stores in a loop that are allowed to need predication.
 static cl::opt<unsigned> NumberOfStoresToPredicate(
     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
@@ -5252,10 +5258,18 @@
   if (Legal->getMaxSafeDepDistBytes() != -1U)
     return 1;
 
-  // Do not interleave loops with a relatively small known or estimated trip
-  // count.
   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
-  if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
+  const bool HasReductions = !Legal->getReductionVars().empty();
+  const bool ScalarReductionCond =
+      InterleaveSmallLoopScalarReduction && HasReductions && VF == 1;
+  // Do not interleave loops with a relatively small known or estimated trip
+  // count. However, we do interleave them when ScalarReductionCond is
+  // satisfied, i.e. InterleaveSmallLoopScalarReduction is enabled and the
+  // loop has scalar reductions (HasReductions && VF == 1): interleaving can
+  // then expose ILP, break the cross-iteration dependences of the reductions,
+  // and benefit the SLP vectorizer in a later pass.
+  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
+      !ScalarReductionCond)
     return 1;
 
   RegisterUsage R = calculateRegisterUsage({VF})[0];
@@ -5340,7 +5354,7 @@
 
   // Interleave if we vectorized this loop and there is a reduction that could
   // benefit from interleaving.
-  if (VF > 1 && !Legal->getReductionVars().empty()) {
+  if (VF > 1 && HasReductions) {
     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
     return IC;
   }
@@ -5352,7 +5366,11 @@
 
   // We want to interleave small loops in order to reduce the loop overhead and
   // potentially expose ILP opportunities.
-  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
+                    << "LV: IC is " << IC << '\n'
+                    << "LV: VF is " << VF << '\n');
+  const bool AggressivelyInterleaveReductions =
+      TTI.enableAggressiveInterleaving(HasReductions);
   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
     // We assume that the cost overhead is 1 and we use the cost model
     // to estimate the cost of the loop and interleave until the cost of the
@@ -5371,7 +5389,7 @@
     // by this point), we can increase the critical path length if the loop
     // we're interleaving is inside another loop. Limit, by default to 2, so the
     // critical path only gets increased by one reduction operation.
-    if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
+    if (HasReductions && TheLoop->getLoopDepth() > 1) {
       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
       SmallIC = std::min(SmallIC, F);
       StoresIC = std::min(StoresIC, F);
@@ -5385,14 +5403,23 @@
       return std::max(StoresIC, LoadsIC);
     }
 
-    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
-    return SmallIC;
+    // If there are scalar reductions and TTI has enabled aggressive
+    // interleaving for reductions, we will interleave to expose ILP.
+    if (InterleaveSmallLoopScalarReduction && VF == 1 &&
+        AggressivelyInterleaveReductions) {
+      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+      // Interleave no less than SmallIC, but not as aggressively as the normal
+      // IC, to cover the rare situation where resources are too limited.
+      return std::max(IC / 2, SmallIC);
+    } else {
+      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+      return SmallIC;
+    }
   }
 
   // Interleave if this is a large loop (small loops are already dealt with by
   // this point) that could benefit from interleaving.
-  bool HasReductions = !Legal->getReductionVars().empty();
-  if (TTI.enableAggressiveInterleaving(HasReductions)) {
+  if (AggressivelyInterleaveReductions) {
     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
     return IC;
   }
Index: llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
@@ -0,0 +1,93 @@
+; RUN: opt < %s -loop-vectorize -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='loop-vectorize' -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+
+;void fun(Vector &MatrixB,
+;         const Vector &MatrixA,
+;         const unsigned int * const start,
+;         const unsigned int * const end,
+;         const double * val) const
+;{
+;  const unsigned int N=MatrixB.size();
+;  MatrixB = MatrixA;
+;  for (unsigned int row=0; row
+%3 = type { %7*, i8* }
+%4 = type { %5 }
+%5 = type { %6 }
+%6 = type { i32**, i32**, i32** }
+%7 = type <{ %8, i32, i32, i32, [4 x i8], i64, i32, [4 x i8], i64*, i32*, i8, i8, [6 x i8] }>
+%8 = type { i32 (...)**, i32, %9, %16* }
+%9 = type { %10 }
+%10 = type { %11 }
+%11 = type { %12, %14 }
+%12 = type { %13 }
+%13 = type { i8 }
+%14 = type { %15, i64 }
+%15 = type { i32, %15*, %15*, %15* }
+%16 = type { i32 (...)**, i8* }
+%17 = type { %8, i32, i32, double* }
+
+$test = comdat any
+define dso_local void @test(%0* %arg, %17* dereferenceable(88) %arg1) comdat align 2 {
+  %tpm14 = getelementptr %0, %0* %arg, i64 0, i32 0, i32 3, i32 0, i32 0, i32 0
+  %tpm15 = load i32**, i32*** %tpm14, align 8
+  %tpm18 = getelementptr inbounds %17, %17* %arg1, i64 0, i32 3
+  %tpm19 = load double*, double** %tpm18, align 8
+  br label %bb22
+bb22:                                   ; preds = %bb33, %bb
+  %tpm26 = add i64 0, 1
+  %tpm27 = getelementptr inbounds i32, i32* null, i64 %tpm26
+  %tpm28 = getelementptr inbounds i32*, i32** %tpm15, i64 undef
+  %tpm29 = load i32*, i32** %tpm28, align 8
+  %tpm32 = getelementptr inbounds double, double* null, i64 %tpm26
+  br label %bb40
+bb33:                                   ; preds = %bb40
+  %tpm35 = getelementptr inbounds double, double* %tpm19, i64 undef
+  %tpm37 = fsub fast double 0.000000e+00, %tpm50
+  store double %tpm37, double* %tpm35, align 8
+  br label %bb22
+bb40:                                   ; preds = %bb40, %bb22
+  %tpm41 = phi i32* [ %tpm51, %bb40 ], [ %tpm27, %bb22 ]
+  %tpm42 = phi double* [ %tpm52, %bb40 ], [ %tpm32, %bb22 ]
+  %tpm43 = phi double [ %tpm50, %bb40 ], [ 0.000000e+00, %bb22 ]
+  %tpm44 = load double, double* %tpm42, align 8
+  %tpm45 = load i32, i32* %tpm41, align 4
+  %tpm46 = zext i32 %tpm45 to i64
+  %tpm47 = getelementptr inbounds double, double* %tpm19, i64 %tpm46
+  %tpm48 = load double, double* %tpm47, align 8
+  %tpm49 = fmul fast double %tpm48, %tpm44
+  %tpm50 = fadd fast double %tpm49, %tpm43
+  %tpm51 = getelementptr inbounds i32, i32* %tpm41, i64 1
+  %tpm52 = getelementptr inbounds double, double* %tpm42, i64 1
+  %tpm53 = icmp eq i32* %tpm51, %tpm29
+  br i1 %tpm53, label %bb33, label %bb40
+}
Index: llvm/test/Transforms/PhaseOrdering/interleave_LV_SLP.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/PhaseOrdering/interleave_LV_SLP.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: powerpc-registered-target
+; RUN: opt < %s -O2 -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='default<O2>' -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+
+;void fun(Vector &MatrixB,
+;         const Vector &MatrixA,
+;         const unsigned int * const start,
+;         const unsigned int * const end,
+;         const double * val) const
+;{
+;  const unsigned int N=MatrixB.size();
+;  MatrixB = MatrixA;
+;  for (unsigned int row=0; row
+%3 = type { %7*, i8* }
+%4 = type { %5 }
+%5 = type { %6 }
+%6 = type { i32**, i32**, i32** }
+%7 = type <{ %8, i32, i32, i32, [4 x i8], i64, i32, [4 x i8], i64*, i32*, i8, i8, [6 x i8] }>
+%8 = type { i32 (...)**, i32, %9, %16* }
+%9 = type { %10 }
+%10 = type { %11 }
+%11 = type { %12, %14 }
+%12 = type { %13 }
+%13 = type { i8 }
+%14 = type { %15, i64 }
+%15 = type { i32, %15*, %15*, %15* }
+%16 = type { i32 (...)**, i8* }
+%17 = type { %8, i32, i32, double* }
+
+$test = comdat any
+define dso_local void @test(%0* %arg, %17* dereferenceable(88) %arg1) comdat align 2 {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[TPM14:%.*]] = getelementptr [[TMP0:%.*]], %0* [[ARG:%.*]], i64 0, i32 0, i32 3, i32 0, i32 0, i32 0
+; CHECK-NEXT:    [[TPM15:%.*]] = load i32**, i32*** [[TPM14]], align 8
+; CHECK-NEXT:    [[TPM18:%.*]] = getelementptr inbounds [[TMP17:%.*]], %17* [[ARG1:%.*]], i64 0, i32 3
+; CHECK-NEXT:    [[TPM19:%.*]] = load double*, double** [[TPM18]], align 8
+; CHECK-NEXT:    [[TPM28:%.*]] = getelementptr inbounds i32*, i32** [[TPM15]], i64 undef
+; CHECK-NEXT:    [[TPM35:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 undef
+; CHECK-NEXT:    br label [[BB22:%.*]]
+; CHECK:       bb22:
+; CHECK-NEXT:    [[TPM29:%.*]] = load i32*, i32** [[TPM28]], align 8
+; CHECK-NEXT:    [[UGLYGEP17:%.*]] = getelementptr i32, i32* [[TPM29]], i64 -2
+; CHECK-NEXT:    [[UGLYGEP2:%.*]] = ptrtoint i32* [[UGLYGEP17]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[UGLYGEP2]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32* [[UGLYGEP17]], inttoptr (i64 12 to i32*)
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[BB40_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       bb40.preheader:
+; CHECK-NEXT:    [[TPM41_PH:%.*]] = phi i32* [ inttoptr (i64 4 to i32*), [[BB22]] ], [ [[IND_END:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; CHECK-NEXT:    [[TPM42_PH:%.*]] = phi double* [ inttoptr (i64 8 to double*), [[BB22]] ], [ [[IND_END4:%.*]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TPM43_PH:%.*]] = phi double [ 0.000000e+00, [[BB22]] ], [ [[BIN_RDX16:%.*]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[BB40:%.*]]
+; CHECK:       vector.ph:
+;
CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804 +; CHECK-NEXT: [[IND_END]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END4]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[N_VEC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[TMP6]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[NEXT_GEP8]] to <4 x double>* +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x double>, <4 x double>* [[TMP7]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[NEXT_GEP]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[NEXT_GEP5]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[NEXT_GEP6]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[NEXT_GEP7]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[TMP17]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP21:%.*]] = load double, double* [[TMP17]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load double, double* [[TMP18]], align 8 +; CHECK-NEXT: [[TMP23:%.*]] = load double, double* [[TMP19]], align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load double, double* [[TMP20]], align 8 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x double> undef, double [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x double> [[TMP25]], double [[TMP22]], i32 1 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> [[TMP26]], double [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x double> [[TMP27]], double [[TMP24]], i32 3 +; CHECK-NEXT: [[TMP29:%.*]] = fmul fast <4 x double> [[TMP28]], [[TMP8]] +; CHECK-NEXT: [[TMP30]] = fadd fast <4 x double> [[TMP29]], [[TMP3]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP30]], i32 0 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x double> [[TMP30]], i32 1 +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast double [[TMP33]], [[TMP32]] +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x double> 
[[TMP30]], i32 2 +; CHECK-NEXT: [[BIN_RDX15:%.*]] = fadd fast double [[TMP34]], [[BIN_RDX]] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x double> [[TMP30]], i32 3 +; CHECK-NEXT: [[BIN_RDX16]] = fadd fast double [[TMP35]], [[BIN_RDX15]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[BB33:%.*]], label [[BB40_PREHEADER]] +; CHECK: bb33: +; CHECK-NEXT: [[TPM50_LCSSA:%.*]] = phi double [ [[BIN_RDX16]], [[MIDDLE_BLOCK]] ], [ [[TPM50:%.*]], [[BB40]] ] +; CHECK-NEXT: [[TPM37:%.*]] = fneg fast double [[TPM50_LCSSA]] +; CHECK-NEXT: store double [[TPM37]], double* [[TPM35]], align 8 +; CHECK-NEXT: br label [[BB22]] +; CHECK: bb40: +; CHECK-NEXT: [[TPM41:%.*]] = phi i32* [ [[TPM51:%.*]], [[BB40]] ], [ [[TPM41_PH]], [[BB40_PREHEADER]] ] +; CHECK-NEXT: [[TPM42:%.*]] = phi double* [ [[TPM52:%.*]], [[BB40]] ], [ [[TPM42_PH]], [[BB40_PREHEADER]] ] +; CHECK-NEXT: [[TPM43:%.*]] = phi double [ [[TPM50]], [[BB40]] ], [ [[TPM43_PH]], [[BB40_PREHEADER]] ] +; CHECK-NEXT: [[TPM44:%.*]] = load double, double* [[TPM42]], align 8 +; CHECK-NEXT: [[TPM45:%.*]] = load i32, i32* [[TPM41]], align 4 +; CHECK-NEXT: [[TPM46:%.*]] = zext i32 [[TPM45]] to i64 +; CHECK-NEXT: [[TPM47:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TPM46]] +; CHECK-NEXT: [[TPM48:%.*]] = load double, double* [[TPM47]], align 8 +; CHECK-NEXT: [[TPM49:%.*]] = fmul fast double [[TPM48]], [[TPM44]] +; CHECK-NEXT: [[TPM50]] = fadd fast double [[TPM49]], [[TPM43]] +; CHECK-NEXT: [[TPM51]] = getelementptr inbounds i32, i32* [[TPM41]], i64 1 +; CHECK-NEXT: [[TPM52]] = getelementptr inbounds double, double* [[TPM42]], i64 1 +; CHECK-NEXT: [[TPM53:%.*]] = icmp eq i32* [[TPM51]], [[TPM29]] +; CHECK-NEXT: br i1 [[TPM53]], label [[BB33]], label [[BB40]], !llvm.loop !2 +; + %tpm14 = getelementptr %0, %0* %arg, i64 0, i32 0, i32 3, i32 0, i32 0, i32 0 + %tpm15 = load i32**, i32*** %tpm14, align 8 + %tpm18 = getelementptr inbounds %17, %17* %arg1, i64 0, i32 3 + %tpm19 = load double*, double** %tpm18, align 8 + br label %bb22 +bb22: ; preds = %bb33, %bb + %tpm26 = add i64 0, 1 + %tpm27 = getelementptr inbounds i32, i32* null, i64 %tpm26 + %tpm28 = getelementptr inbounds i32*, i32** %tpm15, i64 undef + %tpm29 = load i32*, i32** %tpm28, align 8 + %tpm32 = getelementptr inbounds double, double* null, i64 %tpm26 + br label %bb40 +bb33: ; preds = %bb40 + %tpm35 = getelementptr inbounds double, double* %tpm19, i64 undef + %tpm37 = fsub fast double 0.000000e+00, %tpm50 + store double %tpm37, double* %tpm35, align 8 + br label %bb22 +bb40: ; preds = %bb40, %bb22 + %tpm41 = phi i32* [ %tpm51, %bb40 ], [ %tpm27, %bb22 ] + %tpm42 = phi double* [ %tpm52, %bb40 ], [ %tpm32, %bb22 ] + %tpm43 = phi double [ %tpm50, %bb40 ], [ 0.000000e+00, %bb22 ] + %tpm44 = load double, double* %tpm42, align 8 + %tpm45 = load i32, i32* %tpm41, align 4 + %tpm46 = zext i32 %tpm45 to i64 + %tpm47 = getelementptr inbounds double, double* %tpm19, i64 %tpm46 + %tpm48 = load double, double* %tpm47, align 8 + %tpm49 = fmul fast double %tpm48, %tpm44 + %tpm50 = fadd fast double %tpm49, %tpm43 + %tpm51 = getelementptr inbounds i32, i32* %tpm41, i64 1 + %tpm52 = getelementptr inbounds double, double* %tpm42, i64 1 + %tpm53 = icmp eq i32* %tpm51, %tpm29 + br i1 %tpm53, label %bb33, label %bb40 +} Index: llvm/test/Transforms/PhaseOrdering/interleave_LV_SLP_false.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/PhaseOrdering/interleave_LV_SLP_false.ll @@ -0,0 +1,220 @@ +; 
NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: powerpc-registered-target
+; RUN: opt < %s -O2 -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=false 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='default<O2>' -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=false 2>&1 | FileCheck %s
+
+;void fun(Vector &MatrixB,
+;         const Vector &MatrixA,
+;         const unsigned int * const start,
+;         const unsigned int * const end,
+;         const double * val) const
+;{
+;  const unsigned int N=MatrixB.size();
+;  MatrixB = MatrixA;
+;  for (unsigned int row=0; row
+%3 = type { %7*, i8* }
+%4 = type { %5 }
+%5 = type { %6 }
+%6 = type { i32**, i32**, i32** }
+%7 = type <{ %8, i32, i32, i32, [4 x i8], i64, i32, [4 x i8], i64*, i32*, i8, i8, [6 x i8] }>
+%8 = type { i32 (...)**, i32, %9, %16* }
+%9 = type { %10 }
+%10 = type { %11 }
+%11 = type { %12, %14 }
+%12 = type { %13 }
+%13 = type { i8 }
+%14 = type { %15, i64 }
+%15 = type { i32, %15*, %15*, %15* }
+%16 = type { i32 (...)**, i8* }
+%17 = type { %8, i32, i32, double* }
+
+$test = comdat any
+define dso_local void @test(%0* %arg, %17* dereferenceable(88) %arg1) comdat align 2 {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[TPM14:%.*]] = getelementptr [[TMP0:%.*]], %0* [[ARG:%.*]], i64 0, i32 0, i32 3, i32 0, i32 0, i32 0
+; CHECK-NEXT:    [[TPM15:%.*]] = load i32**, i32*** [[TPM14]], align 8
+; CHECK-NEXT:    [[TPM18:%.*]] = getelementptr inbounds [[TMP17:%.*]], %17* [[ARG1:%.*]], i64 0, i32 3
+; CHECK-NEXT:    [[TPM19:%.*]] = load double*, double** [[TPM18]], align 8
+; CHECK-NEXT:    [[TPM28:%.*]] = getelementptr inbounds i32*, i32** [[TPM15]], i64 undef
+; CHECK-NEXT:    [[TPM35:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 undef
+; CHECK-NEXT:    br label [[BB22:%.*]]
+; CHECK:       bb22:
+; CHECK-NEXT:    [[TPM29:%.*]] = load i32*, i32** [[TPM28]], align 8
+; CHECK-NEXT:    [[UGLYGEP9:%.*]] = getelementptr i32, i32* [[TPM29]], i64 -2
+; CHECK-NEXT:    [[UGLYGEP2:%.*]] = ptrtoint i32* [[UGLYGEP9]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[UGLYGEP2]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[BB40_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       bb40.preheader:
+; CHECK-NEXT:    [[TPM41_PH:%.*]] = phi i32* [ inttoptr (i64 4 to i32*), [[BB22]] ], [ [[IND_END:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; CHECK-NEXT:    [[TPM42_PH:%.*]] = phi double* [ inttoptr (i64 8 to double*), [[BB22]] ], [ [[IND_END4:%.*]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TPM43_PH:%.*]] = phi double [ 0.000000e+00, [[BB22]] ], [ [[BIN_RDX:%.*]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[BB40:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775806
+; CHECK-NEXT:    [[IND_END]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END4]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i64 [[N_VEC]], -2
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr exact i64 [[TMP3]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add nuw i64 [[TMP4]], 1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP5]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
+; CHECK:       vector.ph.new:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP5]], -2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0,
[[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH_NEW]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH_NEW]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[TMP7]] +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = load double, double* [[NEXT_GEP6]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load double, double* [[NEXT_GEP7]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[NEXT_GEP]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[NEXT_GEP5]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP17]] = load double, double* [[TMP15]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load double, double* [[TMP16]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = fmul fast double [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[TMP20:%.*]] = fmul fast double [[TMP18]], [[TMP10]] +; CHECK-NEXT: [[TMP21:%.*]] = fadd fast double [[TMP19]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP22:%.*]] = fadd fast double [[TMP20]], [[VEC_PHI8]] +; CHECK-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP_1:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP5_1:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[TMP23]] +; CHECK-NEXT: [[NEXT_GEP6_1:%.*]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP7_1:%.*]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[TMP24]] +; CHECK-NEXT: [[TMP25:%.*]] = load double, double* [[NEXT_GEP6_1]], align 8 +; CHECK-NEXT: [[TMP26:%.*]] = load double, double* [[NEXT_GEP7_1]], align 8 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[NEXT_GEP_1]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[NEXT_GEP5_1]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[TMP27]] to i64 +; CHECK-NEXT: [[TMP30:%.*]] = zext i32 [[TMP28]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP29]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = load double, double* [[TMP31]], align 8 +; CHECK-NEXT: [[TMP34:%.*]] = load double, double* [[TMP32]], align 8 +; CHECK-NEXT: [[TMP35:%.*]] = fmul fast double [[TMP33]], [[TMP25]] +; CHECK-NEXT: [[TMP36:%.*]] = fmul fast double [[TMP34]], [[TMP26]] +; CHECK-NEXT: [[TMP37]] = fadd fast double [[TMP35]], [[TMP21]] +; CHECK-NEXT: [[TMP38]] = fadd fast double [[TMP36]], [[TMP22]] +; CHECK-NEXT: 
[[INDEX_NEXT_1]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[NITER_NSUB_1]] = add i64 [[NITER]], -2 +; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NSUB_1]], 0 +; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block.unr-lcssa: +; CHECK-NEXT: [[DOTLCSSA10_PH:%.*]] = phi double [ undef, [[VECTOR_PH]] ], [ [[TMP37]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTLCSSA_PH:%.*]] = phi double [ undef, [[VECTOR_PH]] ], [ [[TMP38]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_1]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI_UNR:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP37]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI8_UNR:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP38]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp eq i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[MIDDLE_BLOCK]], label [[MIDDLE_BLOCK_EPILOG_LCSSA:%.*]] +; CHECK: middle.block.epilog-lcssa: +; CHECK-NEXT: [[TMP39:%.*]] = or i64 [[INDEX_UNR]], 1 +; CHECK-NEXT: [[NEXT_GEP5_EPIL:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[TMP39]] +; CHECK-NEXT: [[TMP40:%.*]] = load i32, i32* [[NEXT_GEP5_EPIL]], align 4 +; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP40]] to i64 +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = load double, double* [[TMP42]], align 8 +; CHECK-NEXT: [[TMP44:%.*]] = or i64 [[INDEX_UNR]], 1 +; CHECK-NEXT: [[NEXT_GEP7_EPIL:%.*]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[TMP44]] +; CHECK-NEXT: [[TMP45:%.*]] = load double, double* [[NEXT_GEP7_EPIL]], align 8 +; CHECK-NEXT: [[TMP46:%.*]] = fmul fast double [[TMP43]], [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = fadd fast double [[TMP46]], [[VEC_PHI8_UNR]] +; CHECK-NEXT: [[NEXT_GEP_EPIL:%.*]] = getelementptr i32, i32* inttoptr (i64 4 to i32*), i64 [[INDEX_UNR]] +; CHECK-NEXT: [[TMP48:%.*]] = load i32, i32* [[NEXT_GEP_EPIL]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP48]] to i64 +; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TMP49]] +; CHECK-NEXT: [[TMP51:%.*]] = load double, double* [[TMP50]], align 8 +; CHECK-NEXT: [[NEXT_GEP6_EPIL:%.*]] = getelementptr double, double* inttoptr (i64 8 to double*), i64 [[INDEX_UNR]] +; CHECK-NEXT: [[TMP52:%.*]] = load double, double* [[NEXT_GEP6_EPIL]], align 8 +; CHECK-NEXT: [[TMP53:%.*]] = fmul fast double [[TMP51]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = fadd fast double [[TMP53]], [[VEC_PHI_UNR]] +; CHECK-NEXT: br label [[MIDDLE_BLOCK]] +; CHECK: middle.block: +; CHECK-NEXT: [[DOTLCSSA10:%.*]] = phi double [ [[DOTLCSSA10_PH]], [[MIDDLE_BLOCK_UNR_LCSSA]] ], [ [[TMP54]], [[MIDDLE_BLOCK_EPILOG_LCSSA]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi double [ [[DOTLCSSA_PH]], [[MIDDLE_BLOCK_UNR_LCSSA]] ], [ [[TMP47]], [[MIDDLE_BLOCK_EPILOG_LCSSA]] ] +; CHECK-NEXT: [[BIN_RDX]] = fadd fast double [[DOTLCSSA]], [[DOTLCSSA10]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[BB33:%.*]], label [[BB40_PREHEADER]] +; CHECK: bb33: +; CHECK-NEXT: [[TPM50_LCSSA:%.*]] = phi double [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ [[TPM50:%.*]], [[BB40]] ] +; CHECK-NEXT: [[TPM37:%.*]] = fneg fast double [[TPM50_LCSSA]] +; CHECK-NEXT: store double [[TPM37]], double* [[TPM35]], align 8 +; CHECK-NEXT: br label [[BB22]] +; CHECK: bb40: +; CHECK-NEXT: [[TPM41:%.*]] = phi i32* [ 
[[TPM51:%.*]], [[BB40]] ], [ [[TPM41_PH]], [[BB40_PREHEADER]] ] +; CHECK-NEXT: [[TPM42:%.*]] = phi double* [ [[TPM52:%.*]], [[BB40]] ], [ [[TPM42_PH]], [[BB40_PREHEADER]] ] +; CHECK-NEXT: [[TPM43:%.*]] = phi double [ [[TPM50]], [[BB40]] ], [ [[TPM43_PH]], [[BB40_PREHEADER]] ] +; CHECK-NEXT: [[TPM44:%.*]] = load double, double* [[TPM42]], align 8 +; CHECK-NEXT: [[TPM45:%.*]] = load i32, i32* [[TPM41]], align 4 +; CHECK-NEXT: [[TPM46:%.*]] = zext i32 [[TPM45]] to i64 +; CHECK-NEXT: [[TPM47:%.*]] = getelementptr inbounds double, double* [[TPM19]], i64 [[TPM46]] +; CHECK-NEXT: [[TPM48:%.*]] = load double, double* [[TPM47]], align 8 +; CHECK-NEXT: [[TPM49:%.*]] = fmul fast double [[TPM48]], [[TPM44]] +; CHECK-NEXT: [[TPM50]] = fadd fast double [[TPM49]], [[TPM43]] +; CHECK-NEXT: [[TPM51]] = getelementptr inbounds i32, i32* [[TPM41]], i64 1 +; CHECK-NEXT: [[TPM52]] = getelementptr inbounds double, double* [[TPM42]], i64 1 +; CHECK-NEXT: [[TPM53:%.*]] = icmp eq i32* [[TPM51]], [[TPM29]] +; CHECK-NEXT: br i1 [[TPM53]], label [[BB33]], label [[BB40]], !llvm.loop !2 +; + %tpm14 = getelementptr %0, %0* %arg, i64 0, i32 0, i32 3, i32 0, i32 0, i32 0 + %tpm15 = load i32**, i32*** %tpm14, align 8 + %tpm18 = getelementptr inbounds %17, %17* %arg1, i64 0, i32 3 + %tpm19 = load double*, double** %tpm18, align 8 + br label %bb22 +bb22: ; preds = %bb33, %bb + %tpm26 = add i64 0, 1 + %tpm27 = getelementptr inbounds i32, i32* null, i64 %tpm26 + %tpm28 = getelementptr inbounds i32*, i32** %tpm15, i64 undef + %tpm29 = load i32*, i32** %tpm28, align 8 + %tpm32 = getelementptr inbounds double, double* null, i64 %tpm26 + br label %bb40 +bb33: ; preds = %bb40 + %tpm35 = getelementptr inbounds double, double* %tpm19, i64 undef + %tpm37 = fsub fast double 0.000000e+00, %tpm50 + store double %tpm37, double* %tpm35, align 8 + br label %bb22 +bb40: ; preds = %bb40, %bb22 + %tpm41 = phi i32* [ %tpm51, %bb40 ], [ %tpm27, %bb22 ] + %tpm42 = phi double* [ %tpm52, %bb40 ], [ %tpm32, %bb22 ] + %tpm43 = phi double [ %tpm50, %bb40 ], [ 0.000000e+00, %bb22 ] + %tpm44 = load double, double* %tpm42, align 8 + %tpm45 = load i32, i32* %tpm41, align 4 + %tpm46 = zext i32 %tpm45 to i64 + %tpm47 = getelementptr inbounds double, double* %tpm19, i64 %tpm46 + %tpm48 = load double, double* %tpm47, align 8 + %tpm49 = fmul fast double %tpm48, %tpm44 + %tpm50 = fadd fast double %tpm49, %tpm43 + %tpm51 = getelementptr inbounds i32, i32* %tpm41, i64 1 + %tpm52 = getelementptr inbounds double, double* %tpm42, i64 1 + %tpm53 = icmp eq i32* %tpm51, %tpm29 + br i1 %tpm53, label %bb33, label %bb40 +} Index: llvm/test/Transforms/SLPVectorizer/PowerPC/interleave_SLP.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SLPVectorizer/PowerPC/interleave_SLP.ll @@ -0,0 +1,213 @@ +; RUN: opt -S -mcpu=pwr9 -slp-vectorizer -interleave-small-loop-scalar-reduction=true < %s | FileCheck %s +; RUN: opt -S -mcpu=pwr9 -passes='slp-vectorizer' -interleave-small-loop-scalar-reduction=true < %s | FileCheck %s + +; CHECK-LABEL: vector.body + +; CHECK: load <4 x double>, <4 x double>* + +; CHECK: fmul fast <4 x double> + +; CHECK: fadd fast <4 x double> + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux" + +%0 = type { i8 } +%1 = type { %2, %9 } +%2 = type { %3, i8, double, %5, %8* } +%3 = type <{ i32 (...)**, %4, double*, i32 }> +%4 = type { %8*, i8* } +%5 = type { %6 } +%6 = type { %7 } +%7 = type { i32**, i32**, i32** } +%8 = type <{ %9, i32, i32, i32, [4 x i8], i64, i32, 
[4 x i8], i64*, i32*, i8, i8, [6 x i8] }> +%9 = type { i32 (...)**, i32, %10, %17* } +%10 = type { %11 } +%11 = type { %12 } +%12 = type { %13, %15 } +%13 = type { %14 } +%14 = type { i8 } +%15 = type { %16, i64 } +%16 = type { i32, %16*, %16*, %16* } +%17 = type { i32 (...)**, i8* } +%18 = type { %9, i32, i32, double* } +%19 = type <{ i32 (...)**, %4, double*, i32, [4 x i8], %9 }> + +$test0 = comdat any + +@0 = internal global %0 zeroinitializer, align 1 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @1, i8* null }] +declare void @test3(%0*) +declare void @test4(%0*) +; Function Attrs: nofree nounwind +declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) +define weak_odr dso_local void @test0(%1* %arg, %18* dereferenceable(88) %arg1, %18* dereferenceable(88) %arg2) local_unnamed_addr comdat align 2 { +bb: + %tpm = getelementptr inbounds %18, %18* %arg1, i64 0, i32 1 + %tpm3 = load i32, i32* %tpm, align 8 + %tpm4 = bitcast %1* %arg to %19* + %tpm5 = tail call dereferenceable(128) %8* @test1(%19* %tpm4) + %tpm6 = getelementptr inbounds %8, %8* %tpm5, i64 0, i32 8 + %tpm7 = load i64*, i64** %tpm6, align 8 + %tpm8 = tail call dereferenceable(128) %8* @test1(%19* %tpm4) + %tpm9 = getelementptr inbounds %8, %8* %tpm8, i64 0, i32 9 + %tpm10 = load i32*, i32** %tpm9, align 8 + %tpm102 = ptrtoint i32* %tpm10 to i64 + %tpm11 = tail call dereferenceable(88) %18* @test2(%18* nonnull %arg1, %18* nonnull dereferenceable(88) %arg2) + %tpm12 = icmp eq i32 %tpm3, 0 + br i1 %tpm12, label %bb21, label %bb13 + +bb13: ; preds = %bb + %tpm14 = getelementptr %1, %1* %arg, i64 0, i32 0, i32 3, i32 0, i32 0, i32 0 + %tpm15 = load i32**, i32*** %tpm14, align 8 + %tpm16 = getelementptr inbounds %1, %1* %arg, i64 0, i32 0, i32 0, i32 2 + %tpm17 = load double*, double** %tpm16, align 8 + %tpm18 = getelementptr inbounds %18, %18* %arg1, i64 0, i32 3 + %tpm19 = load double*, double** %tpm18, align 8 + %tpm20 = zext i32 %tpm3 to i64 + %0 = sub i64 0, %tpm102 + br label %bb22 + +bb21.loopexit: ; preds = %bb33 + br label %bb21 + +bb21: ; preds = %bb21.loopexit, %bb + ret void + +bb22: ; preds = %bb33, %bb13 + %tpm23 = phi i64 [ 0, %bb13 ], [ %tpm38, %bb33 ] + %tpm24 = getelementptr inbounds i64, i64* %tpm7, i64 %tpm23 + %tpm25 = load i64, i64* %tpm24, align 8 + %tpm26 = add i64 %tpm25, 1 + %tpm27 = getelementptr inbounds i32, i32* %tpm10, i64 %tpm26 + %tpm28 = getelementptr inbounds i32*, i32** %tpm15, i64 %tpm23 + %tpm29 = load i32*, i32** %tpm28, align 8 + %tpm30 = icmp eq i32* %tpm27, %tpm29 + br i1 %tpm30, label %bb33, label %bb31 + +bb31: ; preds = %bb22 + %tpm32 = getelementptr inbounds double, double* %tpm17, i64 %tpm26 + %scevgep = getelementptr i32, i32* %tpm29, i64 -2 + %scevgep1 = bitcast i32* %scevgep to i8* + %uglygep = getelementptr i8, i8* %scevgep1, i64 %0 + %1 = mul i64 %tpm25, -4 + %scevgep3 = getelementptr i8, i8* %uglygep, i64 %1 + %scevgep34 = ptrtoint i8* %scevgep3 to i64 + %2 = lshr i64 %scevgep34, 2 + %3 = add nuw nsw i64 %2, 1 + %min.iters.check = icmp ult i64 %3, 4 + br i1 %min.iters.check, label %scalar.ph, label %vector.ph + +vector.ph: ; preds = %bb31 + %n.mod.vf = urem i64 %3, 4 + %n.vec = sub i64 %3, %n.mod.vf + %ind.end = getelementptr i32, i32* %tpm27, i64 %n.vec + %ind.end6 = getelementptr double, double* %tpm32, i64 %n.vec + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi 
double [ 0.000000e+00, %vector.ph ], [ %36, %vector.body ] + %vec.phi14 = phi double [ 0.000000e+00, %vector.ph ], [ %37, %vector.body ] + %vec.phi15 = phi double [ 0.000000e+00, %vector.ph ], [ %38, %vector.body ] + %vec.phi16 = phi double [ 0.000000e+00, %vector.ph ], [ %39, %vector.body ] + %4 = add i64 %index, 0 + %next.gep = getelementptr i32, i32* %tpm27, i64 %4 + %5 = add i64 %index, 1 + %next.gep7 = getelementptr i32, i32* %tpm27, i64 %5 + %6 = add i64 %index, 2 + %next.gep8 = getelementptr i32, i32* %tpm27, i64 %6 + %7 = add i64 %index, 3 + %next.gep9 = getelementptr i32, i32* %tpm27, i64 %7 + %8 = add i64 %index, 0 + %next.gep10 = getelementptr double, double* %tpm32, i64 %8 + %9 = add i64 %index, 1 + %next.gep11 = getelementptr double, double* %tpm32, i64 %9 + %10 = add i64 %index, 2 + %next.gep12 = getelementptr double, double* %tpm32, i64 %10 + %11 = add i64 %index, 3 + %next.gep13 = getelementptr double, double* %tpm32, i64 %11 + %12 = load double, double* %next.gep10, align 8 + %13 = load double, double* %next.gep11, align 8 + %14 = load double, double* %next.gep12, align 8 + %15 = load double, double* %next.gep13, align 8 + %16 = load i32, i32* %next.gep, align 4 + %17 = load i32, i32* %next.gep7, align 4 + %18 = load i32, i32* %next.gep8, align 4 + %19 = load i32, i32* %next.gep9, align 4 + %20 = zext i32 %16 to i64 + %21 = zext i32 %17 to i64 + %22 = zext i32 %18 to i64 + %23 = zext i32 %19 to i64 + %24 = getelementptr inbounds double, double* %tpm19, i64 %20 + %25 = getelementptr inbounds double, double* %tpm19, i64 %21 + %26 = getelementptr inbounds double, double* %tpm19, i64 %22 + %27 = getelementptr inbounds double, double* %tpm19, i64 %23 + %28 = load double, double* %24, align 8 + %29 = load double, double* %25, align 8 + %30 = load double, double* %26, align 8 + %31 = load double, double* %27, align 8 + %32 = fmul fast double %28, %12 + %33 = fmul fast double %29, %13 + %34 = fmul fast double %30, %14 + %35 = fmul fast double %31, %15 + %36 = fadd fast double %32, %vec.phi + %37 = fadd fast double %33, %vec.phi14 + %38 = fadd fast double %34, %vec.phi15 + %39 = fadd fast double %35, %vec.phi16 + %index.next = add i64 %index, 4 + %40 = icmp eq i64 %index.next, %n.vec + br i1 %40, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %bin.rdx = fadd fast double %37, %36 + %bin.rdx17 = fadd fast double %38, %bin.rdx + %bin.rdx18 = fadd fast double %39, %bin.rdx17 + %cmp.n = icmp eq i64 %3, %n.vec + br i1 %cmp.n, label %bb33.loopexit, label %scalar.ph + +scalar.ph: ; preds = %middle.block, %bb31 + %bc.resume.val = phi i32* [ %ind.end, %middle.block ], [ %tpm27, %bb31 ] + %bc.resume.val5 = phi double* [ %ind.end6, %middle.block ], [ %tpm32, %bb31 ] + %bc.merge.rdx = phi double [ 0.000000e+00, %bb31 ], [ %bin.rdx18, %middle.block ] + br label %bb40 + +bb33.loopexit: ; preds = %middle.block, %bb40 + %tpm50.lcssa = phi double [ %tpm50, %bb40 ], [ %bin.rdx18, %middle.block ] + br label %bb33 + +bb33: ; preds = %bb33.loopexit, %bb22 + %tpm34 = phi double [ 0.000000e+00, %bb22 ], [ %tpm50.lcssa, %bb33.loopexit ] + %tpm35 = getelementptr inbounds double, double* %tpm19, i64 %tpm23 + %tpm36 = load double, double* %tpm35, align 8 + %tpm37 = fsub fast double %tpm36, %tpm34 + store double %tpm37, double* %tpm35, align 8 + %tpm38 = add nuw nsw i64 %tpm23, 1 + %tpm39 = icmp eq i64 %tpm38, %tpm20 + br i1 %tpm39, label %bb21.loopexit, label %bb22 + +bb40: ; preds = %bb40, %scalar.ph + %tpm41 = phi i32* [ %tpm51, %bb40 ], [ %bc.resume.val, %scalar.ph ] + 
%tpm42 = phi double* [ %tpm52, %bb40 ], [ %bc.resume.val5, %scalar.ph ] + %tpm43 = phi double [ %tpm50, %bb40 ], [ %bc.merge.rdx, %scalar.ph ] + %tpm44 = load double, double* %tpm42, align 8 + %tpm45 = load i32, i32* %tpm41, align 4 + %tpm46 = zext i32 %tpm45 to i64 + %tpm47 = getelementptr inbounds double, double* %tpm19, i64 %tpm46 + %tpm48 = load double, double* %tpm47, align 8 + %tpm49 = fmul fast double %tpm48, %tpm44 + %tpm50 = fadd fast double %tpm49, %tpm43 + %tpm51 = getelementptr inbounds i32, i32* %tpm41, i64 1 + %tpm52 = getelementptr inbounds double, double* %tpm42, i64 1 + %tpm53 = icmp eq i32* %tpm51, %tpm29 + br i1 %tpm53, label %bb33.loopexit, label %bb40 +} +declare dereferenceable(128) %8* @test1(%19*) +declare dereferenceable(88) %18* @test2(%18*, %18* dereferenceable(88)) +define internal void @1() section ".text.startup" { +bb: + tail call void @test3(%0* nonnull @0) + %tpm = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%0*)* @test4 to void (i8*)*), i8* getelementptr inbounds (%0, %0* @0, i64 0, i32 0), i8* nonnull @__dso_handle) + ret void +}
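
For reference, a minimal C++ sketch (illustrative only, not taken from this patch or its tests) of the kind of loop the new flag targets: a short inner loop whose only cross-iteration dependence is a scalar floating-point reduction, fed by a gather-like access that usually keeps the loop vectorizer at VF == 1. The function and parameter names below are hypothetical.

// Illustrative sketch only. With -interleave-small-loop-scalar-reduction,
// the interleaved copies of 'sum' break the cross-iteration dependence and
// leave independent multiply-add chains for the SLP vectorizer to pack later.
double sparse_dot(const double *val, const unsigned *idx, const double *x,
                  unsigned n) {
  double sum = 0.0;
  for (unsigned i = 0; i < n; ++i)
    sum += val[i] * x[idx[i]]; // indirect load defeats profitable vector VFs
  return sum;
}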