diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
@@ -49,12 +50,28 @@
           "Number of select groups converted due to high-predictability");
 STATISTIC(NumSelectUnPred,
           "Number of select groups not converted due to unpredictability");
+STATISTIC(NumSelectConvertedLoop,
+          "Number of select groups converted due to loop-level analysis");
 
 static cl::opt<unsigned> ColdOperandThreshold(
     "cold-operand-threshold",
     cl::desc("Maximum frequency of path for an operand to be considered cold."),
     cl::init(20), cl::Hidden);
 
+static cl::opt<unsigned>
+    LoopGradientGain("loop-gradient-gain",
+                     cl::desc("Gradient gain threshold (%)."), cl::init(25),
+                     cl::Hidden);
+
+static cl::opt<unsigned>
+    GainCycleThreshold("select-converter-loop-threshold",
+                       cl::desc("Minimum gain per loop (in cycles) threshold."),
+                       cl::init(4), cl::Hidden);
+
+static cl::opt<unsigned> MispredictDefaultRate(
+    "mispredict-default-rate", cl::Hidden, cl::init(25),
+    cl::desc("Default mispredict rate (initialized to 25%)."));
+
 namespace {
 
 class SelectOptimize : public FunctionPass {
@@ -68,6 +85,10 @@
   std::unique_ptr<BranchProbabilityInfo> BPI;
   ProfileSummaryInfo *PSI;
   OptimizationRemarkEmitter *ORE;
+  TargetSchedModel TSchedModel;
+
+  // Scaling up factor to avoid precision issues with integer division.
+  const unsigned ScalingUpFactor = 1000;
 
 public:
   static char ID;
@@ -93,6 +114,13 @@
   using SelectGroup = SmallVector<SelectInst *, 2>;
   using SelectGroups = SmallVector<SelectGroup, 2>;
 
+  struct CostInfo {
+    /// Predicated cost (with selects as conditional moves).
+    uint64_t PredCost;
+    /// Non-predicated cost (with selects converted to branches).
+    uint64_t NonPredCost;
+  };
+
   bool optimizeSelects(Function &F);
   void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
   void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
@@ -100,10 +128,21 @@
   void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
   void findProfitableSIGroupsBase(SelectGroups &SIGroups,
                                   SelectGroups &ProfSIGroups);
+  void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
+                                        SelectGroups &ProfSIGroups);
   bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI);
   bool hasExpensiveColdOperand(const SmallVector<SelectInst *, 2> &ASI);
-  void getOneUseChain(Instruction *I, SmallVector<Instruction *, 2> &Chain);
-  bool isSelectHighlyPredictable(SelectInst *SI);
+  void getOneUseSlice(Instruction *I, SmallVector<Instruction *, 2> &Slice);
+  bool isSelectHighlyPredictable(const SelectInst *SI);
+  bool checkLoopHeuristics(CostInfo LoopDepth[2]);
+  bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
+                        DenseMap<const Instruction *, CostInfo> &InstCostMap,
+                        CostInfo LoopCost[2]);
+  SmallPtrSet<const Instruction *, 2> getSIset(const SelectGroups &SIGroups);
+  Optional<uint64_t> computeInstCost(const Instruction *I);
+  uint64_t getMispredictionCost(const SelectInst *SI, const uint64_t CondCost);
+  uint64_t predictedPathCost(uint64_t TrueCost, uint64_t FalseCost,
+                             const SelectInst *SI);
   bool isExpensiveInst(Instruction *I);
   bool isSelectKindSupported(SelectInst *SI);
 };
@@ -148,6 +187,7 @@
   BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
   PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+  TSchedModel.init(TSI);
 
   // When optimizing for size, selects are preferable over branches.
   if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI.get()))
@@ -190,7 +230,22 @@
 
 void SelectOptimize::optimizeSelectsInnerLoops(Function &F,
                                                SelectGroups &ProfSIGroups) {
-  llvm_unreachable("Unimplemented");
+  SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
+  // Need to check size on each iteration as we accumulate child loops.
+  for (unsigned long i = 0; i < Loops.size(); ++i)
+    for (Loop *ChildL : Loops[i]->getSubLoops())
+      Loops.push_back(ChildL);
+
+  for (Loop *L : Loops) {
+    if (!L->isInnermost())
+      continue;
+
+    SelectGroups SIGroups;
+    for (BasicBlock *BB : L->getBlocks())
+      collectSelectGroups(*BB, SIGroups);
+
+    findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
+  }
 }
 
 void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
@@ -234,6 +289,54 @@
   }
 }
 
+void SelectOptimize::findProfitableSIGroupsInnerLoops(
+    const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
+  NumSelectOptAnalyzed += SIGroups.size();
+  // For each select group in an inner-most loop,
+  // a branch is more preferable than a select/conditional-move if:
+  // i) conversion to branches satisfies loop-level heuristics including
+  // reducing the loop's critical path by some threshold; and
+  // ii) the cost of the select group is cheaper (in terms of latency) compared
+  // to the select version.
+
+  DenseMap<const Instruction *, CostInfo> InstCostMap;
+  CostInfo LoopCost[2] = {{0, 0}, {0, 0}};
+  if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
+      !checkLoopHeuristics(LoopCost)) {
+    OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
+                                     L->getHeader()->getFirstNonPHI());
+    ORmissL << "Did not convert any select in the loop to branch due to "
+               "loop-level heuristics.";
+    ORE->emit(ORmissL);
+    return;
+  }
+
+  for (SelectGroup &ASI : SIGroups) {
+    // Assuming infinite resources, the cost of a group of instructions is the
+    // cost of the most expensive instruction of the group.
+    uint64_t SelectCost = 0, BranchCost = 0;
+    for (SelectInst *SI : ASI) {
+      SelectCost = std::max(SelectCost, InstCostMap[SI].PredCost);
+      BranchCost = std::max(BranchCost, InstCostMap[SI].NonPredCost);
+    }
+    if (BranchCost < SelectCost) {
+      OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front());
+      OR << "Profitable to convert to branch (loop analysis). BranchCost="
+         << std::to_string(BranchCost)
+         << ", SelectCost=" << std::to_string(SelectCost) << ". ";
+      ORE->emit(OR);
+      ++NumSelectConvertedLoop;
+      ProfSIGroups.push_back(ASI);
+    } else {
+      OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
+      ORmiss << "Select is more profitable (loop analysis). BranchCost="
+             << std::to_string(BranchCost)
+             << ", SelectCost=" << std::to_string(SelectCost) << ". ";
+      ORE->emit(ORmiss);
+    }
+  }
+}
+
 bool SelectOptimize::isConvertToBranchProfitableBase(
     const SmallVector<SelectInst *, 2> &ASI) {
   SelectInst *SI = ASI.front();
@@ -257,7 +360,7 @@
   }
 
   // Look for expensive operands in the cold operand's (if any) dependence
-  // chain of any of the selects in the group.
+  // slice of any of the selects in the group.
   if (hasExpensiveColdOperand(ASI)) {
     ++NumSelectConvertedExpColdOperand;
     OR << "Converted to branch because of expensive cold operand.";
@@ -282,7 +385,7 @@
   }
   if (!ColdOperand)
     return false;
-  // Check if the cold path's dependence chain has an expensive operation for
+  // Check if the cold path's dependence slice has an expensive operation for
   // any of the selects of the group.
   for (SelectInst *SI : ASI) {
     Instruction *ColdI = nullptr;
@@ -290,10 +393,10 @@
       ColdI = dyn_cast<Instruction>(SI->getTrueValue());
     else
       ColdI = dyn_cast<Instruction>(SI->getFalseValue());
-    if (ColdI && ColdI->hasOneUse()) {
-      SmallVector<Instruction *, 2> ColdChain;
-      getOneUseChain(ColdI, ColdChain);
-      for (auto *ColdII : ColdChain)
+    if (ColdI) {
+      SmallVector<Instruction *, 2> ColdSlice;
+      getOneUseSlice(ColdI, ColdSlice);
+      for (auto *ColdII : ColdSlice)
         if (isExpensiveInst(ColdII))
           return true;
     }
@@ -301,8 +404,8 @@
   return false;
 }
 
-void SelectOptimize::getOneUseChain(Instruction *I,
-                                    SmallVector<Instruction *, 2> &Chain) {
+void SelectOptimize::getOneUseSlice(Instruction *I,
+                                    SmallVector<Instruction *, 2> &Slice) {
   std::queue<Instruction *> Worklist;
   Worklist.push(I);
   SmallPtrSet<Instruction *, 2> Visited;
@@ -315,7 +418,7 @@
       continue;
     Visited.insert(II);
 
-    if (I != II && !II->hasOneUse())
+    if (!II->hasOneUse())
       continue;
 
     // Avoid considering instructions in outer or independent loops, or non-loop
@@ -334,14 +437,14 @@
       continue;
     }
 
-    Chain.push_back(II);
+    Slice.push_back(II);
     for (unsigned k = 0; k < II->getNumOperands(); ++k)
       if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
        Worklist.push(OpI);
  }
 }
 
-bool SelectOptimize::isSelectHighlyPredictable(SelectInst *SI) {
+bool SelectOptimize::isSelectHighlyPredictable(const SelectInst *SI) {
   uint64_t TrueWeight, FalseWeight;
   if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
     uint64_t Max = std::max(TrueWeight, FalseWeight);
@@ -355,6 +458,160 @@
   return false;
 }
 
+bool SelectOptimize::checkLoopHeuristics(CostInfo LoopCost[2]) {
+  // Loop-level checks to determine if a non-predicated version (with branches)
+  // of the loop is more profitable than its predicated version.
+  uint64_t Diff[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
+                      LoopCost[1].PredCost - LoopCost[1].NonPredCost};
+
+  // Profitably converting to branches need to reduce the loop's critical path
+  // by at least some threshold (absolute gain of GainCycleThreshold cycles and
+  // relative gain of 12.5%).
+  if (Diff[1] < GainCycleThreshold * ScalingUpFactor &&
+      Diff[1] * 8 < LoopCost[1].PredCost)
+    return false;
+
+  // If the loop's critical path involves loop-carried dependences, the gradient
+  // of the gain needs to be at least LoopGradientGain% (defaults to 25%).
+  if (Diff[1] > Diff[0] &&
+      (Diff[1] - Diff[0]) * 100 <
+          (LoopCost[1].PredCost - LoopCost[0].PredCost) * LoopGradientGain)
+    return false;
+
+  // Non-predicated version of the loop is more profitable than its
+  // predicated version.
+  return true;
+}
+
+bool SelectOptimize::computeLoopCosts(
+    const Loop *L, const SelectGroups &SIGroups,
+    DenseMap<const Instruction *, CostInfo> &InstCostMap,
+    CostInfo LoopCost[2]) {
+  const auto &SIset = getSIset(SIGroups);
+  // Compute instruction and loop-critical-path costs across two iterations for
+  // both predicated and non-predicated version.
+  const unsigned Iterations = 2;
+  for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
+    // Cost of the loop's critical path.
+    CostInfo &MaxCost = LoopCost[Iter];
+    for (BasicBlock *BB : L->getBlocks()) {
+      for (const Instruction &I : *BB) {
+        if (I.isDebugOrPseudoInst())
+          continue;
+        // Compute the predicated and non-predicated cost of the instruction.
+        uint64_t IPredCost = 0, INonPredCost = 0;
+
+        // Assume infinite resources that allow to fully exploit the available
+        // instruction-level parallelism.
+        // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
+        for (const Use &U : I.operands()) {
+          auto UI = dyn_cast<Instruction>(U.get());
+          if (!UI)
+            continue;
+          if (InstCostMap.count(UI)) {
+            IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
+            INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
+          }
+        }
+        auto ILatency = computeInstCost(&I);
+        if (!ILatency.hasValue())
+          return false;
+        IPredCost += ScalingUpFactor * ILatency.getValue();
+        INonPredCost += ScalingUpFactor * ILatency.getValue();
+
+        // For a select that can be converted to branch,
+        // compute its cost as a branch (non-predicated cost).
+        //
+        // BranchCost = PredictedPathCost + MispredictCost
+        // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
+        // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
+        if (SIset.contains(&I)) {
+          auto SI = dyn_cast<SelectInst>(&I);
+
+          uint64_t TrueOpCost = 0, FalseOpCost = 0;
+          if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue()))
+            if (InstCostMap.count(TI))
+              TrueOpCost = InstCostMap[TI].NonPredCost;
+          if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue()))
+            if (InstCostMap.count(FI))
+              FalseOpCost = InstCostMap[FI].NonPredCost;
+          uint64_t PredictedPathCost =
+              predictedPathCost(TrueOpCost, FalseOpCost, SI);
+
+          uint64_t CondCost = 0;
+          if (auto *CI = dyn_cast<Instruction>(SI->getCondition()))
+            if (InstCostMap.count(CI))
+              CondCost = InstCostMap[CI].NonPredCost;
+          uint64_t MispredictCost = getMispredictionCost(SI, CondCost);
+
+          INonPredCost = PredictedPathCost + MispredictCost;
+        }
+
+        InstCostMap[&I] = {IPredCost, INonPredCost};
+        MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
+        MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
+      }
+    }
+  }
+  return true;
+}
+
+SmallPtrSet<const Instruction *, 2>
+SelectOptimize::getSIset(const SelectGroups &SIGroups) {
+  SmallPtrSet<const Instruction *, 2> SIset;
+  for (const SelectGroup &ASI : SIGroups)
+    for (const SelectInst *SI : ASI)
+      SIset.insert(SI);
+  return SIset;
+}
+
+Optional<uint64_t> SelectOptimize::computeInstCost(const Instruction *I) {
+  InstructionCost ICost =
+      TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+  if (auto OC = ICost.getValue())
+    return Optional<uint64_t>(OC.getValue());
+  return Optional<uint64_t>(None);
+}
+
+uint64_t SelectOptimize::getMispredictionCost(const SelectInst *SI,
+                                              const uint64_t CondCost) {
+  uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
+
+  // Account for the default misprediction rate when using a branch
+  // (conservatively set to 25% by default).
+  uint64_t MispredictRate = MispredictDefaultRate;
+  // If the select condition is obviously predictable, then the misprediction
+  // rate is zero.
+  if (isSelectHighlyPredictable(SI))
+    MispredictRate = 0;
+
+  // CondCost is included to account for cases where the computation of the
+  // condition is part of a long dependence chain (potentially loop-carried)
+  // that would delay detection of a misprediction and increase its cost.
+  uint64_t MispredictCost = divideCeil(
+      std::max(ScalingUpFactor * MispredictPenalty, CondCost) * MispredictRate,
+      100);
+  return MispredictCost;
+}
+
+// Returns the cost of a branch when the prediction is correct.
+// TrueCost * TrueProbability + FalseCost * FalseProbability.
+uint64_t SelectOptimize::predictedPathCost(uint64_t TrueCost,
+                                           uint64_t FalseCost,
+                                           const SelectInst *SI) {
+  uint64_t TrueWeight, FalseWeight;
+  if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
+    uint64_t SumWeight = TrueWeight + FalseWeight;
+    if (SumWeight != 0)
+      return divideCeil(TrueCost * TrueWeight + FalseCost * FalseWeight,
+                        SumWeight);
+  }
+  // Without branch weight metadata, we assume 75% for the one path and 25% for
+  // the other, and pick the result with the biggest cost.
+  return std::max(divideCeil(TrueCost * 3 + FalseCost, 4),
+                  divideCeil(FalseCost * 3 + TrueCost, 4));
+}
+
 bool SelectOptimize::isExpensiveInst(Instruction *I) {
   return TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency) >=
          TargetTransformInfo::TCC_Expensive;