diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1558,6 +1558,11 @@
         false; ///< If op is an fp min/max, whether NaNs may be present.
   };
 
+  /// \returns True if the target decides it is profitable to vectorize
+  /// reductions for small trip count loops.
+  bool isProfitableToVectorizeReductionForSmallTC(
+      const RecurrenceDescriptor &RdxDesc) const;
+
   /// \returns True if the target prefers reductions in loop.
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const;
@@ -2001,6 +2006,8 @@
   virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const = 0;
+  virtual bool isProfitableToVectorizeReductionForSmallTC(
+      const RecurrenceDescriptor &RdxDesc) const = 0;
   virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                                      ReductionFlags) const = 0;
   virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
@@ -2676,6 +2683,10 @@
                                VectorType *VecTy) const override {
     return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
   }
+  bool isProfitableToVectorizeReductionForSmallTC(
+      const RecurrenceDescriptor &RdxDesc) const override {
+    return Impl.isProfitableToVectorizeReductionForSmallTC(RdxDesc);
+  }
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const override {
     return Impl.preferInLoopReduction(Opcode, Ty, Flags);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -848,6 +848,11 @@
     return VF;
   }
 
+  // Default: targets are assumed to profit from vectorizing reductions even
+  // for small trip counts; targets override this to veto specific kinds.
+  bool isProfitableToVectorizeReductionForSmallTC(
+      const RecurrenceDescriptor &RdxDesc) const {
+    return true;
+  }
+
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              TTI::ReductionFlags Flags) const {
     return false;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1181,6 +1181,11 @@
   return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
 }
 
+bool TargetTransformInfo::isProfitableToVectorizeReductionForSmallTC(
+    const RecurrenceDescriptor &RdxDesc) const {
+  return TTIImpl->isProfitableToVectorizeReductionForSmallTC(RdxDesc);
+}
+
 bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, Type *Ty,
                                                 ReductionFlags Flags) const {
   return TTIImpl->preferInLoopReduction(Opcode, Ty, Flags);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -281,6 +281,8 @@
 
   TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                                     bool IsZeroCmp) const;
+  bool isProfitableToVectorizeReductionForSmallTC(
+      const RecurrenceDescriptor &RdxDesc) const;
   bool prefersVectorizedAddressing() const;
   bool supportsEfficientVectorElementLoadStore() const;
   bool enableInterleavedAccessVectorization();
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -49,6 +49,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "X86TargetTransformInfo.h"
+#include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/CostTable.h"
@@ -6126,6 +6127,20 @@
   return Options;
 }
 
+bool X86TTIImpl::isProfitableToVectorizeReductionForSmallTC(
+    const RecurrenceDescriptor &RdxDesc) const {
+  // FMinimum and FMaximum reductions are not profitable to vectorize for small
+  // trip counts.
+  // TODO: Check whether other reduction kinds are also unprofitable for small
+  // trip counts.
+  switch (RdxDesc.getRecurrenceKind()) {
+  case RecurKind::FMinimum:
+  case RecurKind::FMaximum:
+    return false;
+  default:
+    return true;
+  }
+}
+
 bool X86TTIImpl::prefersVectorizedAddressing() const {
   return supportsGather();
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -10229,6 +10229,14 @@
       else {
         if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
           LLVM_DEBUG(dbgs() << "\n");
+          // If the target reports that vectorizing any of this loop's
+          // reductions is unprofitable at such a small trip count, bail out.
+          if (!all_of(LVL.getReductionVars(), [&](auto &Reduction) -> bool {
+                const RecurrenceDescriptor &RdxDesc = Reduction.second;
+                return TTI->isProfitableToVectorizeReductionForSmallTC(RdxDesc);
+              }))
+            return false;
+
           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
         } else {
           LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-trip-count.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-trip-count.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-trip-count.ll
@@ -0,0 +1,32 @@
+; RUN: opt -S -passes=loop-vectorize,dce -mcpu=skylake -force-vector-interleave=1 < %s | FileCheck %s
+target triple = "x86_64-unknown-linux-gnu"
+
+declare float @llvm.maximum.f32(float, float)
+declare float @llvm.fabs.f32(float)
+
+; This is a small trip count loop. The cost of the out-of-loop reduction is
+; significant in this case when we only perform a single vector iteration.
+; However, the loop vectorizer does not consider out-of-loop reduction costs.
+
+; CHECK-NOT: <4 x float>
+define float @fmaximum_intrinsic(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n, i32 %tc) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.012 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %s.011 = phi float [ 0.000000e+00, %entry ], [ %max, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.012
+  %x_f = load float, ptr %arrayidx, align 4
+  %arrayidxy = getelementptr inbounds float, ptr %y, i32 %i.012
+  %y_f = load float, ptr %arrayidxy, align 4
+  %sub = fsub float %x_f, %y_f
+  %fabs = call float @llvm.fabs.f32(float %sub)
+  %max = tail call float @llvm.maximum.f32(float %s.011, float %fabs)
+  %inc = add nuw nsw i32 %i.012, 1
+  %exitcond = icmp ult i32 %inc, 3
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret float %max
+}