diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1108,22 +1108,13 @@ enum ScalarEpilogueLowering { // The default: allowing scalar epilogues. - CM_ScalarEpilogueAllowed, - - // Vectorization with OptForSize: don't allow epilogues. - CM_ScalarEpilogueNotAllowedOptSize, - - // A special case of vectorisation with OptForSize: loops with a very small - // trip count are considered for vectorization under OptForSize, thereby - // making sure the cost of their loop body is dominant, free of runtime - // guards and scalar iteration overheads. - CM_ScalarEpilogueNotAllowedLowTripLoop, + CM_SEL_Allowed, // Loop hint predicate indicating an epilogue is undesired. - CM_ScalarEpilogueNotNeededUsePredicate, + CM_SEL_PredicateElseScalar, // Directive indicating we must either tail fold or not vectorize - CM_ScalarEpilogueNotAllowedUsePredicate + CM_SEL_PredicateOrDontVectorize }; /// ElementCountComparator creates a total ordering for ElementCount @@ -1165,7 +1156,7 @@ /// \return True if runtime checks are required for vectorization, and false /// otherwise. - bool runtimeChecksRequired(); + //bool runtimeChecksRequired(); /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO @@ -1518,7 +1509,7 @@ /// Returns true if a scalar epilogue is not allowed due to optsize or a /// loop hint annotation. bool isScalarEpilogueAllowed() const { - return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; + return ScalarEpilogueStatus == CM_SEL_Allowed; } /// Returns true if all loop blocks should be masked to fold tail loop. @@ -1695,7 +1686,7 @@ /// or as a peel-loop to handle gaps in interleave-groups. /// Under optsize and when the trip count is very small we don't allow any /// iterations to execute in the scalar loop. - ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + ScalarEpilogueLowering ScalarEpilogueStatus = CM_SEL_Allowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. bool FoldTailByMasking = false; @@ -4830,10 +4821,14 @@ Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::runtimeChecksRequired() { +static bool runtimeChecksRequired( + LoopVectorizationLegality &Legal, + PredicatedScalarEvolution &PSE, + OptimizationRemarkEmitter *ORE, + Loop *TheLoop) { LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); - if (Legal->getRuntimePointerChecking()->Need) { + if (Legal.getRuntimePointerChecking()->Need) { reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", "runtime pointer checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " @@ -4852,7 +4847,7 @@ } // FIXME: Avoid specializing for stride==1 instead of bailing out. - if (!Legal->getLAI()->getSymbolicStrides().empty()) { + if (!Legal.getLAI()->getSymbolicStrides().empty()) { reportVectorizationFailure("Runtime stride check for small trip count", "runtime stride == 1 checks needed. Enable vectorization of " "this loop without such check by compiling with -Os/-Oz", @@ -5051,34 +5046,10 @@ return FixedScalableVFPair::getNone(); } - switch (ScalarEpilogueStatus) { - case CM_ScalarEpilogueAllowed: + if (ScalarEpilogueStatus == CM_SEL_Allowed) return computeFeasibleMaxVF(TC, UserVF, false); - case CM_ScalarEpilogueNotAllowedUsePredicate: - [[fallthrough]]; - case CM_ScalarEpilogueNotNeededUsePredicate: - LLVM_DEBUG( - dbgs() << "LV: vector predicate hint/switch found.\n" - << "LV: Not allowing scalar epilogue, creating predicated " - << "vector loop.\n"); - break; - case CM_ScalarEpilogueNotAllowedLowTripLoop: - // fallthrough as a special case of OptForSize - case CM_ScalarEpilogueNotAllowedOptSize: - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) - LLVM_DEBUG( - dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); - else - LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " - << "count.\n"); - - // Bail if runtime checks are required, which are not good when optimising - // for size. - if (runtimeChecksRequired()) - return FixedScalableVFPair::getNone(); - break; - } + LLVM_DEBUG(dbgs() << "LV: Choosed to fold vector tail by masking.\n"); // The only loops we can vectorize without a scalar epilogue, are loops with // a bottom-test and a single exiting block. We'd have to handle the fact @@ -5087,10 +5058,10 @@ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { // If there was a tail-folding hint/switch, but we can't fold the tail by // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { + if (ScalarEpilogueStatus == CM_SEL_PredicateElseScalar) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); - ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + ScalarEpilogueStatus = CM_SEL_Allowed; return computeFeasibleMaxVF(TC, UserVF, false); } return FixedScalableVFPair::getNone(); @@ -5144,14 +5115,14 @@ // If there was a tail-folding hint/switch, but we can't fold the tail by // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { + if (ScalarEpilogueStatus == CM_SEL_PredicateElseScalar) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); - ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + ScalarEpilogueStatus = CM_SEL_Allowed; return MaxFactors; } - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { + if (ScalarEpilogueStatus == CM_SEL_PredicateOrDontVectorize) { LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); return FixedScalableVFPair::getNone(); } @@ -9832,12 +9803,14 @@ // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing // predication, and 4) a TTI hook that analyses whether the loop is suitable // for predication. -static Optional getScalarEpilogueLowering( +Optional getScalarEpilogueLowering( Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, - AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, - LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI, - OptimizationRemarkEmitter *ORE) { + AssumptionCache *AC, LoopInfo *LI, PredicatedScalarEvolution &PSE, + DominatorTree *DT, LoopVectorizationLegality &LVL, + InterleavedAccessInfo *IAI, OptimizationRemarkEmitter *ORE) { + auto &SE = *PSE.getSE(); + // 1) OptSize takes precedence over all other options, i.e. if this is set, // don't look at hints or options, and don't request a scalar epilogue. // (For PGSO, as shouldOptimizeForSize isn't currently accessible from @@ -9848,43 +9821,54 @@ // back to the old way and vectorize with versioning when forced. See D81345.) if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, PGSOQueryType::IRPass) && - Hints.getForce() != LoopVectorizeHints::FK_Enabled)) - return CM_ScalarEpilogueNotAllowedOptSize; + Hints.getForce() != LoopVectorizeHints::FK_Enabled)) { + if (runtimeChecksRequired(LVL, PSE, ORE, L)) { + LLVM_DEBUG( + dbgs() + << "LV: Not vectorizing due to runtime checks with -Os/-Oz.\n"); + return None; + } + LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + return CM_SEL_PredicateOrDontVectorize; + } // 2) If set, obey the directives if (PreferPredicateOverEpilogue.getNumOccurrences()) { switch (PreferPredicateOverEpilogue) { case PreferPredicateTy::ScalarEpilogue: - return CM_ScalarEpilogueAllowed; + return CM_SEL_Allowed; case PreferPredicateTy::PredicateElseScalarEpilogue: - return CM_ScalarEpilogueNotNeededUsePredicate; + return CM_SEL_PredicateElseScalar; case PreferPredicateTy::PredicateOrDontVectorize: - return CM_ScalarEpilogueNotAllowedUsePredicate; + return CM_SEL_PredicateOrDontVectorize; }; } // 3) If set, obey the hints switch (Hints.getPredicate()) { case LoopVectorizeHints::FK_Enabled: - return CM_ScalarEpilogueNotNeededUsePredicate; + return CM_SEL_PredicateElseScalar; case LoopVectorizeHints::FK_Disabled: - return CM_ScalarEpilogueAllowed; + return CM_SEL_Allowed; }; // 4) Check the loop for a trip count threshold. Vectorize loops with a tiny // trip count by optimizing for size, to minimize overheads. - auto ExpectedTC = getSmallBestKnownTC(*SE, L); + auto ExpectedTC = getSmallBestKnownTC(SE, L); if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is worth vectorizing only if no scalar " - << "iteration overheads are incurred."); + << "iteration overheads are incurred.\n"); if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { - if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) { - return CM_ScalarEpilogueNotAllowedLowTripLoop; - } else { - LLVM_DEBUG(dbgs() << " But the target considers the trip count too " + if (runtimeChecksRequired(LVL, PSE, ORE, L)) { + LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " + << "count.\n"); + return None; + } + if (*ExpectedTC <= TTI->getMinTripCountTailFoldingThreshold()) { + LLVM_DEBUG(dbgs() << "But the target considers the trip count too " "small to consider vectorizing.\n"); reportVectorizationFailure( "The trip count is below the minimal threshold value.", @@ -9892,14 +9876,15 @@ "LowTripCount", ORE, L); return None; } + return CM_SEL_PredicateOrDontVectorize; } } // 5) if the TTI hook indicates this is profitable, request predication. - if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI)) - return CM_ScalarEpilogueNotNeededUsePredicate; + if (TTI->preferPredicateOverEpilogue(L, LI, SE, *AC, TLI, DT, &LVL, IAI)) + return CM_SEL_PredicateElseScalar; - return CM_ScalarEpilogueAllowed; + return CM_SEL_Allowed; } Value *VPTransformState::get(VPValue *Def, unsigned Part) { @@ -9991,7 +9976,7 @@ Optional SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, - PSE.getSE(), DT, *LVL, &IAI, ORE); + PSE, DT, *LVL, &IAI, ORE); if (!SEL) { Hints.emitRemarkWithHints(); @@ -10265,7 +10250,7 @@ // Check the function attributes and profiles to find out if this function // should be optimized for size. Optional SEL = getScalarEpilogueLowering( - F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI, ORE); + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE, DT, LVL, &IAI, ORE); if (!SEL) { Hints.emitRemarkWithHints();