diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1558,6 +1558,11 @@
         false; ///< If op is an fp min/max, whether NaNs may be present.
   };
 
+  /// \returns True if the target decides it is profitable to vectorize
+  /// reductions for small trip count loops.
+  bool isProfitableToVectorizeReductionForSmallTC(
+      const RecurrenceDescriptor &RdxDesc) const;
+
   /// \returns True if the target prefers reductions in loop.
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const;
@@ -2001,6 +2006,8 @@
   virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const = 0;
+  virtual bool isProfitableToVectorizeReductionForSmallTC(
+      const RecurrenceDescriptor &RdxDesc) const = 0;
   virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                                      ReductionFlags) const = 0;
   virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
@@ -2676,6 +2683,10 @@
                                VectorType *VecTy) const override {
     return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
   }
+  bool isProfitableToVectorizeReductionForSmallTC(
+      const RecurrenceDescriptor &RdxDesc) const override {
+    return Impl.isProfitableToVectorizeReductionForSmallTC(RdxDesc);
+  }
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const override {
     return Impl.preferInLoopReduction(Opcode, Ty, Flags);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -848,6 +848,11 @@
     return VF;
   }
 
+  // Default: targets are assumed to profit from vectorizing reductions even
+  // for small trip counts; targets override this to veto specific kinds.
+  bool isProfitableToVectorizeReductionForSmallTC(
+      const RecurrenceDescriptor &RdxDesc) const {
+    return true;
+  }
+
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              TTI::ReductionFlags Flags) const {
     return false;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1181,6 +1181,11 @@
   return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
 }
 
+bool TargetTransformInfo::isProfitableToVectorizeReductionForSmallTC(
+    const RecurrenceDescriptor &RdxDesc) const {
+  return TTIImpl->isProfitableToVectorizeReductionForSmallTC(RdxDesc);
+}
+
 bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, Type *Ty,
                                                 ReductionFlags Flags) const {
   return TTIImpl->preferInLoopReduction(Opcode, Ty, Flags);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -281,6 +281,8 @@
 
   TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                                     bool IsZeroCmp) const;
+  bool isProfitableToVectorizeReductionForSmallTC(
+      const RecurrenceDescriptor &RdxDesc) const;
   bool prefersVectorizedAddressing() const;
   bool supportsEfficientVectorElementLoadStore() const;
   bool enableInterleavedAccessVectorization();
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -49,6 +49,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "X86TargetTransformInfo.h"
+#include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/CostTable.h"
@@ -6126,6 +6127,20 @@
   return Options;
 }
 
+bool X86TTIImpl::isProfitableToVectorizeReductionForSmallTC(
+    const RecurrenceDescriptor &RdxDesc) const {
+  // FMinimum and FMaximum reductions are not profitable to vectorize for small
+  // trip counts.
+  // TODO: Check whether other reduction kinds are also unprofitable for small
+  // trip counts.
+  switch (RdxDesc.getRecurrenceKind()) {
+  case RecurKind::FMinimum:
+  case RecurKind::FMaximum:
+    return false;
+  default:
+    return true;
+  }
+}
+
 bool X86TTIImpl::prefersVectorizedAddressing() const {
   return supportsGather();
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -10229,6 +10229,14 @@
       else {
         if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
           LLVM_DEBUG(dbgs() << "\n");
+          // If the target reports that vectorizing any of this loop's
+          // reductions is unprofitable at such a small trip count, bail out.
+          if (!all_of(LVL.getReductionVars(), [&](auto &Reduction) -> bool {
+                const RecurrenceDescriptor &RdxDesc = Reduction.second;
+                return TTI->isProfitableToVectorizeReductionForSmallTC(RdxDesc);
+              }))
+            return false;
+
           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
         } else {
           LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-trip-count.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-trip-count.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-trip-count.ll
@@ -0,0 +1,32 @@
+; RUN: opt -S -passes=loop-vectorize,dce -mcpu=skylake -force-vector-interleave=1 < %s | FileCheck %s
+target triple = "x86_64-unknown-linux-gnu"
+
+declare float @llvm.maximum.f32(float, float)
+declare float @llvm.fabs.f32(float)
+
+; This is a small trip count loop. The cost of the out-of-loop reduction is
+; significant in this case when we only perform a single vector iteration.
+; However, the loop vectorizer does not consider out-of-loop reduction costs.
+
+; CHECK-NOT: <4 x float>
+define float @fmaximum_intrinsic(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n, i32 %tc) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.012 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %s.011 = phi float [ 0.000000e+00, %entry ], [ %max, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.012
+  %x_f = load float, ptr %arrayidx, align 4
+  %arrayidxy = getelementptr inbounds float, ptr %y, i32 %i.012
+  %y_f = load float, ptr %arrayidxy, align 4
+  %sub = fsub float %x_f, %y_f
+  %fabs = call float @llvm.fabs.f32(float %sub)
+  %max = tail call float @llvm.maximum.f32(float %s.011, float %fabs)
+  %inc = add nuw nsw i32 %i.012, 1
+  %exitcond = icmp ult i32 %inc, 3
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret float %max
+}