Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -199,6 +199,11 @@
     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
     cl::desc("The maximum allowed number of runtime memory checks"));
 
+static cl::opt<bool> IgnoreOutOfLoopReductionCost(
+    "vectorizer-ignore-out-of-loop-reduction-cost", cl::init(false),
+    cl::desc("Ignore the cost of out-of-loop reductions in the vectorizer "
+             "cost model"));
+
 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
 // that predication is preferred, and this lists all options. I.e., the
 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -1654,6 +1659,9 @@
   /// \p VF is the vectorization factor chosen for the original loop.
   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
 
+  /// Returns total cost for out-of-loop reductions.
+  InstructionCost getOutOfLoopReductionCost(VectorizationFactor VF);
+
 private:
   unsigned NumPredStores = 0;
 
@@ -6505,6 +6513,72 @@
   return Cost;
 }
 
+InstructionCost
+LoopVectorizationCostModel::getOutOfLoopReductionCost(VectorizationFactor VF) {
+  InstructionCost ReduxCost = 0;
+  if (VF.Width.isScalar() || IgnoreOutOfLoopReductionCost)
+    return ReduxCost;
+
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  for (auto &Reduction : Legal->getReductionVars()) {
+    PHINode *Phi = Reduction.first;
+    auto *VectorTy = cast<VectorType>(ToVectorTy(Phi->getType(), VF.Width));
+    const RecurrenceDescriptor &RdxDesc = Reduction.second;
+    if (isInLoopReduction(Phi))
+      continue;
+    RecurKind RK = RdxDesc.getRecurrenceKind();
+    auto FMF = RdxDesc.getFastMathFlags();
+    switch (RK) {
+    case RecurKind::Add:
+    case RecurKind::Mul:
+    case RecurKind::Or:
+    case RecurKind::And:
+    case RecurKind::Xor:
+    case RecurKind::FAdd:
+    case RecurKind::FMul: {
+      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RK);
+      ReduxCost +=
+          TTI.getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
+      break;
+    }
+    case RecurKind::FMax:
+    case RecurKind::FMin:
+    case RecurKind::FMaximum:
+    case RecurKind::FMinimum:
+    case RecurKind::SMax:
+    case RecurKind::SMin:
+    case RecurKind::UMax:
+    case RecurKind::UMin: {
+      Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK);
+      ReduxCost += TTI.getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
+      break;
+    }
+    case RecurKind::FMulAdd: {
+      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RK);
+      ReduxCost +=
+          TTI.getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
+      // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
+      // normal fmul instruction to the cost of the fadd reduction.
+      ReduxCost +=
+          TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
+      break;
+    }
+    case RecurKind::FAnyOf:
+    case RecurKind::IAnyOf: {
+      // This has the cost of vector.reduce.or, but may have other costs as
+      // well. FIXME: This recur kind does not have a well-defined cost yet.
+      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RecurKind::Or);
+      ReduxCost +=
+          TTI.getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
+      break;
+    }
+    default:
+      llvm_unreachable("Unexpected reduction operation!");
+    }
+  }
+  return ReduxCost;
+}
+
 std::optional<InstructionCost>
 LoopVectorizationCostModel::getReductionPatternCost(
     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
@@ -9723,14 +9797,28 @@
   }
 }
 
-static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
-                                       VectorizationFactor &VF,
-                                       std::optional<unsigned> VScale, Loop *L,
-                                       ScalarEvolution &SE) {
-  InstructionCost CheckCost = Checks.getCost();
+// What makes the loop unprofitable to vectorize.
+namespace OutOfLoopCost {
+enum Reason {
+  None, // OutOfLoopCost is zero.
+  RuntimeCheck,
+  OutOfLoopReduction,
+  Some // Combination of above reasons: We have both runtime checks and out of
+       // loop reductions.
+};
+}
+
+static OutOfLoopCost::Reason areOutOfLoopComputationsProfitable(
+    InstructionCost CheckCost, InstructionCost ReduxCost,
+    VectorizationFactor &VF, std::optional<unsigned> VScale, Loop *L,
+    ScalarEvolution &SE) {
   if (!CheckCost.isValid())
-    return false;
+    return OutOfLoopCost::RuntimeCheck;
+  auto ReduxCostVal = *ReduxCost.getValue();
+  double RtC = *CheckCost.getValue();
+  if (!ReduxCostVal && !RtC)
+    return OutOfLoopCost::None;
 
   // When interleaving only scalar and vector cost will be equal, which in turn
   // would lead to a divide by 0. Fall back to hard threshold.
   if (VF.Width.isScalar()) {
@@ -9738,15 +9826,21 @@
       LLVM_DEBUG(
           dbgs()
          << "LV: Interleaving only is not profitable due to runtime checks\n");
-      return false;
+      return OutOfLoopCost::RuntimeCheck;
     }
-    return true;
+    if (ReduxCostVal) {
+      LLVM_DEBUG(dbgs() << "LV: Interleaving only is not profitable due to "
+                           "out of loop reductions\n");
+      return OutOfLoopCost::OutOfLoopReduction;
+    }
+    return OutOfLoopCost::None;
   }
 
-  // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
+  // The scalar cost should only be 0 when vectorizing with a user specified
+  // VF/IC. In those cases, ignore out of loop costs.
   double ScalarC = *VF.ScalarCost.getValue();
   if (ScalarC == 0)
-    return true;
+    return OutOfLoopCost::None;
 
   // First, compute the minimum iteration count required so that the vector
   // loop outperforms the scalar loop.
@@ -9757,7 +9851,7 @@
   //  * ScalarC is the cost of a single scalar iteration.
   //
   // The total cost of the vector loop is
-  //   RtC + VecC * (TC / VF) + EpiC
+  //   RtC + VecC * (TC / VF) + EpiC + ReduxCost
   // where
   //  * RtC is the cost of the generated runtime checks
   //  * VecC is the cost of a single vector iteration.
@@ -9765,13 +9859,15 @@
   //  * VF is the vectorization factor
   //  * EpiCost is the cost of the generated epilogue, including the cost
   //    of the remaining scalar operations.
+  //  * ReduxCost is the cost of out-of-loop reductions which are executed if
+  //    the vector loop is taken.
   //
   // Vectorization is profitable once the total vector cost is less than the
   // total scalar cost:
-  //   RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
+  //   RtC + VecC * (TC / VF) + EpiC + ReduxCost < ScalarC * TC
   //
   // Now we can compute the minimum required trip count TC as
-  //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
+  //   (RtC + EpiC + ReduxCost) / (ScalarC - (VecC / VF)) < TC
   //
   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the computations are performed on doubles, not integers and the result
@@ -9784,8 +9880,7 @@
     IntVF *= AssumedMinimumVscale;
   }
   double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
-  double RtC = *CheckCost.getValue();
-  double MinTC1 = RtC / (ScalarC - VecCOverVF);
+  double MinTC1 = (RtC + ReduxCostVal) / (ScalarC - VecCOverVF);
 
   // Second, compute a minimum iteration count so that the cost of the
   // runtime checks is only a fraction of the total scalar loop cost. This
@@ -9794,6 +9889,8 @@
   //   * TC. To bound the runtime check to be a fraction 1/X of the scalar
   //     cost, compute
   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
+  // Note that we can ignore ReduxCost here since out-of-loop reductions are
+  // computed only if the vector loop is taken.
   double MinTC2 = RtC * 10 / ScalarC;
 
   // Now pick the larger minimum. If it is not a multiple of VF, choose the
@@ -9802,9 +9899,9 @@
   uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
   VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
 
-  LLVM_DEBUG(
-      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
-             << VF.MinProfitableTripCount << "\n");
+  LLVM_DEBUG(dbgs() << "LV: Minimum required TC for out-of-loop computations "
+                       "to be profitable:"
+                    << VF.MinProfitableTripCount << "\n");
 
   // Skip vectorization if the expected trip count is less than the minimum
   // required trip count.
@@ -9816,10 +9913,14 @@
                           << *ExpectedTC << " < " << VF.MinProfitableTripCount
                           << ")\n");
-      return false;
+      // If possible, return the exact reason we cannot vectorize the small
+      // trip count loop.
+      return (!RtC)            ? OutOfLoopCost::OutOfLoopReduction
+             : !(ReduxCostVal) ? OutOfLoopCost::RuntimeCheck
+                               : OutOfLoopCost::Some;
     }
   }
-  return true;
+  return OutOfLoopCost::None;
 }
 
 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
@@ -10010,23 +10111,42 @@
     if (VF.Width.isVector() || SelectedIC > 1)
       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
 
-    // Check if it is profitable to vectorize with runtime checks.
+    // Check if it is profitable to vectorize with out of loop computations
+    // (such as reductions and runtime checks).
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
-    if (!ForceVectorization &&
-        !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
-                                    *PSE.getSE())) {
-      ORE->emit([&]() {
-        return OptimizationRemarkAnalysisAliasing(
-                   DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
-                   L->getHeader())
-               << "loop not vectorized: cannot prove it is safe to reorder "
-                  "memory operations";
-      });
-      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
-      Hints.emitRemarkWithHints();
-      return false;
-    }
+    if (!ForceVectorization) {
+      InstructionCost RTCheckCost = Checks.getCost();
+      InstructionCost ReduxCost = CM.getOutOfLoopReductionCost(VF);
+
+      auto UnprofitableReason = areOutOfLoopComputationsProfitable(
+          RTCheckCost, ReduxCost, VF, getVScaleForTuning(L, *TTI), L,
+          *PSE.getSE());
+      switch (UnprofitableReason) {
+      case OutOfLoopCost::None:
+        break;
+      case OutOfLoopCost::RuntimeCheck: {
+        ORE->emit([&]() {
+          return OptimizationRemarkAnalysisAliasing(
+                     DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
+                     L->getHeader())
+                 << "loop not vectorized: cannot prove it is safe to reorder "
+                    "memory operations";
+        });
+        LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+        Hints.emitRemarkWithHints();
+        return false;
+      }
+      case OutOfLoopCost::OutOfLoopReduction:
+        LLVM_DEBUG(dbgs() << "LV: Costly out of loop reductions for small "
+                             "trip count loop.\n");
+        return false;
+      default:
+        LLVM_DEBUG(dbgs() << "LV: Costly out of loop computation for small "
+                             "trip count loop.\n");
+        return false;
+      }
+    } // ForceVectorization
   }
 
   // Identify the diagnostic messages that should be produced.
Index: llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
@@ -17,7 +17,7 @@
 ; CHECK-NEXT: [[CMP_NOT16:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0
 ; CHECK-NEXT: br i1 [[CMP_NOT16]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
 ; CHECK: while.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 16
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -4
@@ -86,7 +86,7 @@
 ; CHECK-NEXT: [[ACCUM_1]] = phi float [ [[ADD4]], [[IF_THEN]] ], [ [[ACCUM_017]], [[WHILE_BODY]] ]
 ; CHECK-NEXT: [[DEC]] = add i32 [[BLOCKSIZE_ADDR_018]], -1
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: while.end:
 ; CHECK-NEXT: [[ACCUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ACCUM_1]], [[IF_END]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret float [[ACCUM_0_LCSSA]]
Index: llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
@@ -34,7 +34,7 @@
 ; CHECK-NEXT: [[ARRAYIDX113:%.*]] = getelementptr inbounds float, ptr [[T7]], i32 [[T2]]
 ; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[ARRAYIDX113]], align 4
 ; CHECK-NEXT: [[CONV114:%.*]] = fpext float [[T8]] to double
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[T]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[T]], 8
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[T]], 2
@@ -88,7 +88,7 @@
 ; CHECK-NEXT: [[SUB127]] = fsub fast double [[DVAL1_4131]], [[MUL126]]
 ; CHECK-NEXT: [[INC129]] = add nuw nsw i32 [[I_2132]], 1
 ; CHECK-NEXT: [[EXITCOND143:%.*]] = icmp eq i32 [[INC129]], [[T]]
-; CHECK-NEXT: br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: outerend:
 ; CHECK-NEXT: [[SUB127_LCSSA:%.*]] = phi double [ [[SUB127]], [[INNERLOOP]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: [[CONV138:%.*]] = fptosi double [[SUB127_LCSSA]] to i32
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll
@@ -16,41 +16,41 @@
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI1]])
-; CHECK-NEXT: [[TMP3]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]])
+; CHECK-NEXT: [[TMP1]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI1]])
+; CHECK-NEXT: [[TMP2]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]])
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP2]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 2147483647, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ -2147483648, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 2147483647, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ -2147483648, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[MAX_0_LCSSA:%.*]] = phi i32 [ -2147483648, [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[MIN_0_LCSSA:%.*]] = phi i32 [ 2147483647, [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[MAX_0_LCSSA:%.*]] = phi i32 [ -2147483648, [[ENTRY:%.*]] ], [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[MIN_0_LCSSA:%.*]] = phi i32 [ 2147483647, [[ENTRY]] ], [ [[COND9:%.*]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: store i32 [[MIN_0_LCSSA]], ptr [[MINP:%.*]], align 4
 ; CHECK-NEXT: ret i32 [[MAX_0_LCSSA]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[MIN_028:%.*]] = phi i32 [ [[TMP9]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[MAX_027:%.*]] = phi i32 [ [[TMP8]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[MIN_028:%.*]] = phi i32 [ [[COND9]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[MAX_027:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_029]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP8]] = call i32 @llvm.smax.i32(i32 [[TMP7]], i32 [[MAX_027]])
-; CHECK-NEXT: [[TMP9]] = call i32 @llvm.smin.i32(i32 [[TMP7]], i32 [[MIN_028]])
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[COND]] = call i32 @llvm.smax.i32(i32 [[TMP6]], i32 [[MAX_027]])
+; CHECK-NEXT: [[COND9]] = call i32 @llvm.smin.i32(i32 [[TMP6]], i32 [[MIN_028]])
 ; CHECK-NEXT: [[INC]] = add nuw i32 [[I_029]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
 entry:
   %cmp26.not = icmp eq i32 %N, 0
Index: llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
@@ -78,34 +78,35 @@
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 4, i64 [[TMP1]])
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
-; CHECK-NEXT: [[TMP7]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; CHECK-NEXT: [[TMP8]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP7]])
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP8]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -117,7 +118,7 @@
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: for.end:
-; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]]
 ;
 entry:
Index: llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -14,48 +14,49 @@
 ; OUTLOOP: for.body.preheader:
 ; OUTLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
 ; OUTLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
-; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]]
+; OUTLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 8, i32 [[TMP1]])
+; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP2]]
 ; OUTLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; OUTLOOP: vector.ph:
-; OUTLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
-; OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]]
+; OUTLOOP-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; OUTLOOP-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4
+; OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP4]]
 ; OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
 ; OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
 ; OUTLOOP: vector.body:
 ; OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; OUTLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
-; OUTLOOP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]]
-; OUTLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP6]], align 2
-; OUTLOOP-NEXT: [[TMP7:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; OUTLOOP-NEXT: [[TMP8]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP7]]
-; OUTLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
-; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP10]]
-; OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; OUTLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; OUTLOOP-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0
+; OUTLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP5]]
+; OUTLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
+; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2
+; OUTLOOP-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; OUTLOOP-NEXT: [[TMP9]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP8]]
+; OUTLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vscale.i32()
+; OUTLOOP-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4
+; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP11]]
+; OUTLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; OUTLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; OUTLOOP: middle.block:
-; OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP8]])
+; OUTLOOP-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP9]])
 ; OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; OUTLOOP: scalar.ph:
 ; OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
 ; OUTLOOP: for.body:
 ; OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
 ; OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
-; OUTLOOP-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32
+; OUTLOOP-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
 ; OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
 ; OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; OUTLOOP: for.cond.cleanup.loopexit:
-; OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
 ; OUTLOOP: for.cond.cleanup:
 ; OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
Index: llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
@@ -402,36 +402,37 @@
 ; VLENUNK-NEXT: entry:
 ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 4, i64 [[TMP1]])
+; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]]
 ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLENUNK: vector.ph:
-; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; VLENUNK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; VLENUNK: vector.body:
 ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLENUNK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]]
-; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
-; VLENUNK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
-; VLENUNK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP7]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
-; VLENUNK-NEXT: [[TMP8]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; VLENUNK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; VLENUNK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLENUNK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VLENUNK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; VLENUNK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP5]]
+; VLENUNK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; VLENUNK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
+; VLENUNK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP8]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; VLENUNK-NEXT: [[TMP9]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
+; VLENUNK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; VLENUNK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLENUNK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; VLENUNK: middle.block:
-; VLENUNK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP8]])
+; VLENUNK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP9]])
 ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; VLENUNK: scalar.ph:
 ; VLENUNK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VLENUNK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; VLENUNK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; VLENUNK-NEXT: br label [[FOR_BODY:%.*]]
 ; VLENUNK: for.body:
 ; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -445,43 +446,44 @@
 ; VLENUNK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
 ; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; VLENUNK: for.end:
-; VLENUNK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; VLENUNK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; VLENUNK-NEXT: ret i64 [[SUM_NEXT_LCSSA]]
 ;
 ; VLEN128-LABEL: @indexed_load(
 ; VLEN128-NEXT: entry:
 ; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 4, i64 [[TMP1]])
+; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]]
 ; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLEN128: vector.ph:
-; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; VLEN128-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]]
 ; VLEN128: vector.body:
 ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLEN128-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]]
-; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
-; VLEN128-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
-; VLEN128-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP7]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
-; VLEN128-NEXT: [[TMP8]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; VLEN128-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; VLEN128-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLEN128-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VLEN128-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; VLEN128-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP5]]
+; VLEN128-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; VLEN128-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
+; VLEN128-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP8]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; VLEN128-NEXT: [[TMP9]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
+; VLEN128-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; VLEN128-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLEN128-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; VLEN128: middle.block:
-; VLEN128-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP8]])
+; VLEN128-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP9]])
 ; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; VLEN128: scalar.ph:
 ; VLEN128-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VLEN128-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; VLEN128-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; VLEN128-NEXT: br label [[FOR_BODY:%.*]]
 ; VLEN128: for.body:
 ; VLEN128-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -495,7 +497,7 @@
 ; VLEN128-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
 ; VLEN128-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; VLEN128: for.end:
-; VLEN128-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; VLEN128-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; VLEN128-NEXT: ret i64 [[SUM_NEXT_LCSSA]]
 ;
 entry:
Index: llvm/test/Transforms/LoopVectorize/X86/ctpop-small-trip-count.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/X86/ctpop-small-trip-count.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=loop-vectorize -mcpu=znver2 -vectorizer-ignore-out-of-loop-reduction-cost=0 -force-vector-interleave=1 < %s | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; PR 57476
+; Hard-coded trip count of 2.
+; FIXME: We still vectorize it, since the reduction cost is 1 (for VF=2).
+define i64 @test_trip_count_2(ptr %arr) {
+; CHECK-LABEL: define i64 @test_trip_count_2(
+; CHECK-SAME: ptr [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[ARR]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP4]] = add <2 x i64> [[VEC_PHI]], [[TMP3]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP4]])
+; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[ACCUM:%.*]] = phi i64 [ [[ACCUM_NEXT:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[ARR]], i64 [[IV]]
+; CHECK-NEXT: [[VALUE:%.*]] = load i64, ptr [[GEP]], align 8
+; CHECK-NEXT: [[CTPOP:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[VALUE]])
+; CHECK-NEXT: [[ACCUM_NEXT]] = add i64 [[ACCUM]], [[CTPOP]]
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 2
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: [[LCSSA:%.*]] = phi i64 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i64 [[LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %accum = phi i64 [ %accum.next, %loop ], [ 0, %entry ]
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %iv.next = add nuw i64 %iv, 1
+  %gep = getelementptr inbounds i64, ptr %arr, i64 %iv
+  %value = load i64, ptr %gep, align 8
+  %ctpop = tail call i64 @llvm.ctpop.i64(i64 %value)
+  %accum.next = add i64 %accum, %ctpop
+  %exitcond = icmp eq i64 %iv.next, 2
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  %lcssa = phi i64 [ %accum.next, %loop ]
+  ret i64 %lcssa
+}
+
+; Same loop as above, with a profile showing a trip count of 2.
+; We do not vectorize this when we consider the cost of reductions, since the
+; reduction cost along with the vector cost (with VF=4) is higher than the
+; scalar cost.
+define i64 @test_trip_count_prof_2(ptr %arr, i64 %n) {
+; CHECK-LABEL: define i64 @test_trip_count_prof_2(
+; CHECK-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[ACCUM:%.*]] = phi i64 [ [[ACCUM_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[ARR]], i64 [[IV]]
+; CHECK-NEXT: [[VALUE:%.*]] = load i64, ptr [[GEP]], align 8
+; CHECK-NEXT: [[CTPOP:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[VALUE]])
+; CHECK-NEXT: [[ACCUM_NEXT]] = add i64 [[ACCUM]], [[CTPOP]]
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]], !prof [[PROF4:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: [[LCSSA:%.*]] = phi i64 [ [[ACCUM_NEXT]], [[LOOP]] ]
+; CHECK-NEXT: ret i64 [[LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %accum = phi i64 [ %accum.next, %loop ], [ 0, %entry ]
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %iv.next = add nuw i64 %iv, 1
+  %gep = getelementptr inbounds i64, ptr %arr, i64 %iv
+  %value = load i64, ptr %gep, align 8
+  %ctpop = tail call i64 @llvm.ctpop.i64(i64 %value)
+  %accum.next = add i64 %accum, %ctpop
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %exit, label %loop, !prof !2
+
+exit:
+  %lcssa = phi i64 [ %accum.next, %loop ]
+  ret i64 %lcssa
+}
+declare i64 @llvm.ctpop.i64(i64)
+
+!2 = !{!"branch_weights", i32 1, i32 2}
Index: llvm/test/Transforms/LoopVectorize/X86/reduction-small-trip-count.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/X86/reduction-small-trip-count.ll
@@ -0,0 +1,94 @@
+; RUN: opt -S -passes=loop-vectorize,dce -mcpu=skylake -vectorizer-ignore-out-of-loop-reduction-cost=0 -force-vector-interleave=1 < %s | FileCheck %s
+target triple = "x86_64-unknown-linux-gnu"
+
+declare float @llvm.maximum.f32(float, float)
+declare float @llvm.fabs.f32(float)
+
+; This is a small trip count loop. The cost of the out-of-loop reduction is
+; significant in this case when we only perform a single vector iteration.
+; However, the loop vectorizer does not consider out-of-loop reduction costs.
+
+; CHECK-LABEL: fmaximum_intrinsic
+; CHECK-NOT: llvm.vector.reduce.fmaximum
+define float @fmaximum_intrinsic(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n, i32 %tc) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.012 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %s.011 = phi float [ 0.000000e+00, %entry ], [ %max, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.012
+  %x_f = load float, ptr %arrayidx, align 4
+  %arrayidxy = getelementptr inbounds float, ptr %y, i32 %i.012
+  %y_f = load float, ptr %arrayidxy, align 4
+  %sub = fsub float %x_f, %y_f
+  %fabs = call float @llvm.fabs.f32(float %sub)
+  %max = tail call float @llvm.maximum.f32(float %s.011, float %fabs)
+  %inc = add nuw nsw i32 %i.012, 1
+  %exitcond = icmp ult i32 %inc, 3
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret float %max
+}
+
+; A trip count of 6 is still considered non-profitable for reducing adds (the
+; minimum trip count required is 8).
+; CHECK-LABEL: reduction_sum
+; CHECK-NOT: llvm.vector.reduce.add
+define i32 @reduction_sum(i32 %n, ptr noalias nocapture %A, ptr noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %3 = load i32, ptr %2, align 4
+  %4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %5 = load i32, ptr %4, align 4
+  %6 = trunc i64 %indvars.iv to i32
+  %7 = add i32 %sum.02, %6
+  %8 = add i32 %7, %3
+  %9 = add i32 %8, %5
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph, !prof !1
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
+
+; CHECK-LABEL: reduction_mix
+; CHECK-LABEL: middle.block:
+; CHECK-NEXT: vector.reduce.add
+; CHECK-NEXT: br
+define i32 @reduction_mix(i32 %n, ptr noalias nocapture %A, ptr noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %3 = load i32, ptr %2, align 4
+  %4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %5 = load i32, ptr %4, align 4
+  %6 = mul nsw i32 %5, %3
+  %7 = trunc i64 %indvars.iv to i32
+  %8 = add i32 %sum.02, %7
+  %9 = add i32 %8, %6
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph, !prof !2
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
+
+!1 = !{!"branch_weights", i32 1, i32 5}
+!2 = !{!"branch_weights", i32 1, i32 7}
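
Note (appended after the diff, not part of the patch): the trip-count bound evaluated by areOutOfLoopComputationsProfitable can be sanity-checked in isolation. Below is a minimal standalone C++ sketch of the same arithmetic, mirroring the cost-model comments in the patch; all cost values used are made-up illustrative numbers, not outputs of this patch or of TTI.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Mirrors the comment block in the patch: vectorization is profitable once
    //   RtC + VecC * (TC / VF) + EpiC + ReduxCost < ScalarC * TC
    // and, with EpiC assumed to be 0, the minimum trip count satisfies
    //   (RtC + ReduxCost) / (ScalarC - VecC / VF) < TC.
    static uint64_t minProfitableTripCount(double RtC, double ReduxCost,
                                           double ScalarC, double VecC,
                                           unsigned VF) {
      // First bound: the vector loop plus its out-of-loop work must beat the
      // scalar loop.
      double MinTC1 = (RtC + ReduxCost) / (ScalarC - VecC / VF);
      // Second bound: runtime checks must stay below ~10% of the scalar loop
      // cost. ReduxCost is left out here, as it is only paid when the vector
      // loop actually runs.
      double MinTC2 = RtC * 10 / ScalarC;
      uint64_t MinTC =
          static_cast<uint64_t>(std::ceil(std::max(MinTC1, MinTC2)));
      // Round up to a multiple of VF, as the patch does with alignTo().
      return (MinTC + VF - 1) / VF * VF;
    }

    int main() {
      // Illustrative inputs: scalar iteration cost 4, vector iteration cost 8
      // at VF=4, runtime-check cost 12, out-of-loop reduction cost 6.
      std::printf("MinTC = %llu\n", static_cast<unsigned long long>(
                                        minProfitableTripCount(12, 6, 4, 8, 4)));
      return 0;
    }

With these inputs MinTC1 = (12 + 6) / (4 - 8/4) = 9 and MinTC2 = 12 * 10 / 4 = 30; the larger bound (30) rounded up to a multiple of VF=4 gives 32, so a loop expected to run fewer than 32 iterations would be rejected, which is the mechanism behind the raised MIN_ITERS_CHECK thresholds in the test diffs above.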