diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -318,6 +318,11 @@ unsigned RolledDynamicCost; }; +enum class UnrollingOption : bool { + Full, + Partial, +}; + } // end anonymous namespace /// Figure out if the loop is worth full unrolling. @@ -753,49 +758,34 @@ } }; -// Returns true if unroll count was set explicitly. -// Calculates unroll count and writes it to UP.Count. -// Unless IgnoreUser is true, will also use metadata and command-line options -// that are specific to to the LoopUnroll pass (which, for instance, are -// irrelevant for the LoopUnrollAndJam pass). -// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes -// many LoopUnroll-specific options. The shared functionality should be -// refactored into it own function. -bool llvm::computeUnrollCount( - Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, - ScalarEvolution &SE, const SmallPtrSetImpl &EphValues, - OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, - bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize, - TargetTransformInfo::UnrollingPreferences &UP, - TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) { +bool shouldPragmaUnroll(Loop *L, OptimizationRemarkEmitter *ORE, + const unsigned TripMultiple, const unsigned TripCount, + UnrollCostEstimator UCE, + TargetTransformInfo::UnrollingPreferences &UP) { - UnrollCostEstimator UCE(*L, LoopSize); + const bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0; - // Use an explicit peel count that has been specified for testing. In this - // case it's not permitted to also specify an explicit unroll count. 
- if (PP.PeelCount) { - if (UnrollCount.getNumOccurrences() > 0) { - report_fatal_error("Cannot specify both explicit peel count and " - "explicit unroll count"); - } - UP.Count = 1; - UP.Runtime = false; - return true; - } + const bool PragmaFullUnroll = hasUnrollFullPragma(L); + const unsigned PragmaCount = unrollCountPragmaValue(L); + const bool PragmaEnableUnroll = hasUnrollEnablePragma(L); + const bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll || + PragmaEnableUnroll || UserUnrollCount; - // Check for explicit Count. + DEBUG_WITH_TYPE("mypass", dbgs() << "With TripCount: " << TripCount + << " Entered Unrolling\n"); + + // Using unroll pragma // 1st priority is unroll count set by "unroll-count" option. - bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0; - if (UserUnrollCount) { + + if (UnrollCount.getNumOccurrences() > 0) { UP.Count = UnrollCount; UP.AllowExpensiveTripCount = true; UP.Force = true; if (UP.AllowRemainder && UCE.getUnrolledLoopSize(UP) < UP.Threshold) - return true; + return false; } // 2nd priority is unroll count set by pragma. - unsigned PragmaCount = unrollCountPragmaValue(L); if (PragmaCount > 0) { UP.Count = PragmaCount; UP.Runtime = true; @@ -803,19 +793,15 @@ UP.Force = true; if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) && UCE.getUnrolledLoopSize(UP) < PragmaUnrollThreshold) - return true; + return false; } - bool PragmaFullUnroll = hasUnrollFullPragma(L); + if (PragmaFullUnroll && TripCount != 0) { UP.Count = TripCount; if (UCE.getUnrolledLoopSize(UP) < PragmaUnrollThreshold) return false; } - bool PragmaEnableUnroll = hasUnrollEnablePragma(L); - bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll || - PragmaEnableUnroll || UserUnrollCount; - if (ExplicitUnroll && TripCount != 0) { // If the loop has an unrolling pragma, we want to be more aggressive with // unrolling limits. 
Set thresholds to at least the PragmaUnrollThreshold @@ -825,21 +811,19 @@ std::max(UP.PartialThreshold, PragmaUnrollThreshold); } - // 3rd priority is full unroll count. - // Full unroll makes sense only when TripCount or its upper bound could be - // statically calculated. - // Also we need to check if we exceed FullUnrollMaxCount. - // If using the upper bound to unroll, TripMultiple should be set to 1 because - // we do not know when loop may exit. + // If we have not returned by this point, continue to the other priorities + UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount; + return true; +} + +bool shouldFullUnroll(Loop *L, const TargetTransformInfo &TTI, + DominatorTree &DT, ScalarEvolution &SE, + const SmallPtrSetImpl &EphValues, + OptimizationRemarkEmitter *ORE, const bool MaxOrZero, + const unsigned TripCount, const unsigned MaxTripCount, + UnrollCostEstimator UCE, + TargetTransformInfo::UnrollingPreferences &UP) { - // We can unroll by the upper bound amount if it's generally allowed or if - // we know that the loop is executed either the upper bound or zero times. - // (MaxOrZero unrolling keeps only the first loop test, so the number of - // loop tests remains the same compared to the non-unrolled version, whereas - // the generic upper bound unrolling keeps all but the last loop test so the - // number of loop tests goes up which may end up being worse on targets with - // constrained branch predictor resources so is controlled by an option.) - // In addition we only unroll small upper bounds. unsigned FullUnrollMaxTripCount = MaxTripCount; if (!(UP.UpperBound || MaxOrZero) || FullUnrollMaxTripCount > UnrollMaxUpperBound) @@ -854,14 +838,12 @@ unsigned FullUnrollTripCount = ExactTripCount ?
ExactTripCount : FullUnrollMaxTripCount; UP.Count = FullUnrollTripCount; + if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) { // When computing the unrolled size, note that BEInsns are not replicated // like the rest of the loop body. if (UCE.getUnrolledLoopSize(UP) < UP.Threshold) { - UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount); - TripCount = FullUnrollTripCount; - TripMultiple = UP.UpperBound ? 1 : TripMultiple; - return ExplicitUnroll; + return false; } else { // The loop isn't that small, but we still can fully unroll it if that // helps to remove a significant number of instructions. @@ -873,32 +855,24 @@ unsigned Boost = getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); if (Cost->UnrolledCost < UP.Threshold * Boost / 100) { - UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount); - TripCount = FullUnrollTripCount; - TripMultiple = UP.UpperBound ? 1 : TripMultiple; - return ExplicitUnroll; + return false; } } } } + return true; +} - // 4th priority is loop peeling. - computePeelCount(L, LoopSize, PP, TripCount, SE, UP.Threshold); - if (PP.PeelCount) { - UP.Runtime = false; - UP.Count = 1; - return ExplicitUnroll; - } +bool shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount, + UnrollCostEstimator UCE, + TargetTransformInfo::UnrollingPreferences &UP) { - // 5th priority is partial unrolling. - // Try partial unroll only when TripCount could be statically calculated. 
if (TripCount) { - UP.Partial |= ExplicitUnroll; if (!UP.Partial) { LLVM_DEBUG(dbgs() << " will not try to unroll partially because " << "-unroll-allow-partial not given\n"); UP.Count = 0; - return false; + return true; } if (UP.Count == 0) UP.Count = TripCount; @@ -923,15 +897,6 @@ UP.Count >>= 1; } if (UP.Count < 2) { - if (PragmaEnableUnroll) - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, - "UnrollAsDirectedTooLarge", - L->getStartLoc(), L->getHeader()) - << "Unable to unroll loop as directed by unroll(enable) " - "pragma " - "because unrolled size is too large."; - }); UP.Count = 0; } } else { @@ -939,6 +904,123 @@ } if (UP.Count > UP.MaxCount) UP.Count = UP.MaxCount; + + LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count + << "\n"); + + return false; + } + + // If we have not returned by this point, continue to the other priorities + return true; +} +// Returns true if unroll count was set explicitly. +// Calculates unroll count and writes it to UP.Count. +// Unless IgnoreUser is true, will also use metadata and command-line options +// that are specific to the LoopUnroll pass (which, for instance, are +// irrelevant for the LoopUnrollAndJam pass). +// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes +// many LoopUnroll-specific options. The shared functionality should be +// refactored into its own function.
+ +bool llvm::computeUnrollCount( + Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, + ScalarEvolution &SE, const SmallPtrSetImpl &EphValues, + OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, + bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize, + TargetTransformInfo::UnrollingPreferences &UP, + TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) { + + UnrollCostEstimator UCE(*L, LoopSize); + + bool ShouldContinue; + + const bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0; + + const bool PragmaFullUnroll = hasUnrollFullPragma(L); + const unsigned PragmaCount = unrollCountPragmaValue(L); + const bool PragmaEnableUnroll = hasUnrollEnablePragma(L); + const bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll || + PragmaEnableUnroll || UserUnrollCount; + + // Check for explicit Count. + // 1st priority is unroll count set by "unroll-count" option. + // 2nd priority is unroll count set by pragma. + ShouldContinue = shouldPragmaUnroll(L, ORE, TripMultiple, TripCount, UCE, UP); + + DEBUG_WITH_TYPE("mypass", dbgs() << "After trying for full unroll " + << "Continue: " << int(ShouldContinue) + << " Explicit: " << int(ExplicitUnroll) + << "\n"); + + if (ShouldContinue == false) + return ExplicitUnroll; + + // 3rd priority is full unroll count. + // Full unroll makes sense only when TripCount or its upper bound could be + // statically calculated. + // Also we need to check if we exceed FullUnrollMaxCount. + // If using the upper bound to unroll, TripMultiple should be set to 1 because + // we do not know when loop may exit. + + // We can unroll by the upper bound amount if it's generally allowed or if + // we know that the loop is executed either the upper bound or zero times. 
+ // (MaxOrZero unrolling keeps only the first loop test, so the number of + // loop tests remains the same compared to the non-unrolled version, whereas + // the generic upper bound unrolling keeps all but the last loop test so the + // number of loop tests goes up which may end up being worse on targets with + // constrained branch predictor resources so is controlled by an option.) + // In addition we only unroll small upper bounds. + + ShouldContinue = shouldFullUnroll(L, TTI, DT, SE, EphValues, ORE, MaxOrZero, + TripCount, MaxTripCount, UCE, UP); + + unsigned FullUnrollMaxTripCount = MaxTripCount; + if (!(UP.UpperBound || MaxOrZero) || + FullUnrollMaxTripCount > UnrollMaxUpperBound) + FullUnrollMaxTripCount = 0; + + unsigned ExactTripCount = TripCount; + unsigned FullUnrollTripCount = + ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount; + + // if shouldFullUnroll can do the unrolling, some side parameters should be + // set + if (ShouldContinue == false) { + UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount); + TripCount = FullUnrollTripCount; + TripMultiple = UP.UpperBound ? 1 : TripMultiple; + return ExplicitUnroll; + } + + // 4th priority is loop peeling. + computePeelCount(L, LoopSize, PP, TripCount, SE, UP.Threshold); + if (PP.PeelCount) { + UP.Runtime = false; + UP.Count = 1; + DEBUG_WITH_TYPE("mypass", dbgs() << "After trying for peeling " + << "Continue: " << int(ShouldContinue) + << " Explicit: " << int(ExplicitUnroll) + << "\n"); + return ExplicitUnroll; + } + + // Before starting partial unrolling, set UP.Partial to true, + // if user explicitly asked for unrolling + if (TripCount) + UP.Partial |= ExplicitUnroll; + + // 5th priority is partial unrolling. + // Try partial unroll only when TripCount could be statically calculated.
+ ShouldContinue = shouldPartialUnroll(LoopSize, TripCount, UCE, UP); + + DEBUG_WITH_TYPE("mypass", dbgs() << "After trying for partial unroll " + << "Continue: " << int(ShouldContinue) + << " Explicit: " << int(ExplicitUnroll) + << "\n"); + + if (ShouldContinue == false) { + if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && UP.Count != TripCount) ORE->emit([&]() { @@ -949,10 +1031,24 @@ "because " "unrolled size is too large."; }); - LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count - << "\n"); + + if (UP.PartialThreshold != NoThreshold) { + if (UP.Count == 0) { + if (PragmaEnableUnroll) + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, + "UnrollAsDirectedTooLarge", + L->getStartLoc(), L->getHeader()) + << "Unable to unroll loop as directed by unroll(enable) " + "pragma " + "because unrolled size is too large."; + }); + } + } + return ExplicitUnroll; } + assert(TripCount == 0 && "All cases when TripCount is constant should be covered here."); if (PragmaFullUnroll) @@ -988,8 +1084,6 @@ } } - // Reduce count based on the type of unrolling and the threshold values. - UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount; if (!UP.Runtime) { LLVM_DEBUG( dbgs() << " will not try to unroll loop with runtime trip count " @@ -1023,7 +1117,7 @@ using namespace ore; - if (PragmaCount > 0 && !UP.AllowRemainder) + if (unrollCountPragmaValue(L) > 0 && !UP.AllowRemainder) ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "DifferentUnrollCountFromDirected",