diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//

+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
@@ -30,6 +31,7 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/ScaledNumber.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/SizeOpts.h"
 #include <algorithm>
@@ -52,6 +54,8 @@
           "Number of select groups not converted due to unpredictability");
 STATISTIC(NumSelectColdBB,
           "Number of select groups not converted due to cold basic block");
+STATISTIC(NumSelectConvertedLoop,
+          "Number of select groups converted due to loop-level analysis");
 STATISTIC(NumSelectsConverted, "Number of selects converted");

 static cl::opt<unsigned> ColdOperandThreshold(
@@ -65,6 +69,31 @@
              "slice of a cold operand to be considered inexpensive."),
     cl::init(1), cl::Hidden);

+static cl::opt<unsigned>
+    GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
+                          cl::desc("Gradient gain threshold (%)."),
+                          cl::init(25), cl::Hidden);
+
+static cl::opt<unsigned>
+    GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
+                       cl::desc("Minimum gain per loop (in cycles) threshold."),
+                       cl::init(4), cl::Hidden);
+
+static cl::opt<unsigned> GainRelativeThreshold(
+    "select-opti-loop-relative-gain-threshold",
+    cl::desc(
+        "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
+    cl::init(8), cl::Hidden);
+
+static cl::opt<unsigned> MispredictDefaultRate(
+    "mispredict-default-rate", cl::Hidden, cl::init(25),
+    cl::desc("Default mispredict rate (initialized to 25%)."));
+
+static cl::opt<bool>
+    DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
+                               cl::init(false),
+                               cl::desc("Disable loop-level heuristics."));
+
 namespace {

 class SelectOptimize : public FunctionPass {
@@ -78,6 +107,7 @@
   std::unique_ptr<BranchProbabilityInfo> BPI;
   ProfileSummaryInfo *PSI;
   OptimizationRemarkEmitter *ORE;
+  TargetSchedModel TSchedModel;

 public:
   static char ID;
@@ -103,6 +133,15 @@
   using SelectGroup = SmallVector<SelectInst *, 2>;
   using SelectGroups = SmallVector<SelectGroup, 2>;

+  using Scaled64 = ScaledNumber<uint64_t>;
+
+  struct CostInfo {
+    /// Predicated cost (with selects as conditional moves).
+    Scaled64 PredCost;
+    /// Non-predicated cost (with selects converted to branches).
+    Scaled64 NonPredCost;
+  };
+
   // Converts select instructions of a function to conditional jumps when deemed
   // profitable. Returns true if at least one select was converted.
   bool optimizeSelects(Function &F);
@@ -122,9 +161,12 @@
   void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);

   // Determines for which select groups it is profitable converting to branches
-  // (base heuristics).
+  // (base and inner-most-loop heuristics).
   void findProfitableSIGroupsBase(SelectGroups &SIGroups,
                                   SelectGroups &ProfSIGroups);
+  void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
+                                        SelectGroups &ProfSIGroups);
+
   // Determines if a select group should be converted to a branch (base
   // heuristics).
   bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI);
@@ -143,6 +185,29 @@
   // Returns true if the condition of the select is highly predictable.
   bool isSelectHighlyPredictable(const SelectInst *SI);

+  // Loop-level checks to determine if a non-predicated version (with branches)
+  // of the given loop is more profitable than its predicated version.
+  bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);
+
+  // Computes instruction and loop-critical-path costs for both the predicated
+  // and non-predicated version of the given loop.
+  bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
+                        DenseMap<const Instruction *, CostInfo> &InstCostMap,
+                        CostInfo *LoopCost);
+
+  // Returns a set of all the select instructions in the given select groups.
+  SmallPtrSet<const Instruction *, 2> getSIset(const SelectGroups &SIGroups);
+
+  // Returns the latency cost of a given instruction.
+  Optional<uint64_t> computeInstCost(const Instruction *I);
+
+  // Returns the misprediction cost of a given select when converted to branch.
+  Scaled64 getMispredictionCost(const SelectInst *SI, const Scaled64 CondCost);
+
+  // Returns the cost of a branch when the prediction is correct.
+  Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
+                                const SelectInst *SI);
+
   // Returns true if the target architecture supports lowering a given select.
   bool isSelectKindSupported(SelectInst *SI);
 };
@@ -183,6 +248,7 @@
   BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
   PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+  TSchedModel.init(TSI);

   // When optimizing for size, selects are preferable over branches.
   if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI.get()))
@@ -224,7 +290,24 @@
 }

 void SelectOptimize::optimizeSelectsInnerLoops(Function &F,
-                                               SelectGroups &ProfSIGroups) {}
+                                               SelectGroups &ProfSIGroups) {
+  SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
+  // Need to check size on each iteration as we accumulate child loops.
+  for (unsigned long i = 0; i < Loops.size(); ++i)
+    for (Loop *ChildL : Loops[i]->getSubLoops())
+      Loops.push_back(ChildL);
+
+  for (Loop *L : Loops) {
+    if (!L->isInnermost())
+      continue;
+
+    SelectGroups SIGroups;
+    for (BasicBlock *BB : L->getBlocks())
+      collectSelectGroups(*BB, SIGroups);
+
+    findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
+  }
+}

 /// If \p isTrue is true, return the true value of \p SI, otherwise return
 /// false value of \p SI. If the true/false value of \p SI is defined by any
@@ -376,6 +459,53 @@
   }
 }

+void SelectOptimize::findProfitableSIGroupsInnerLoops(
+    const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
+  NumSelectOptAnalyzed += SIGroups.size();
+  // For each select group in an inner-most loop,
+  // a branch is preferable to a select/conditional-move if:
+  // i) conversion to branches for all the select groups of the loop satisfies
+  //    loop-level heuristics including reducing the loop's critical path by
+  //    some threshold (see SelectOptimize::checkLoopHeuristics); and
+  // ii) the total cost of the select group is cheaper with a branch compared
+  //     to its predicated version. The cost is in terms of latency and the cost
+  //     of a select group is the cost of its most expensive select instruction
+  //     (assuming infinite resources and thus fully leveraging available ILP).
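+  //
+  // Illustrative example (hypothetical latencies, not taken from any real
+  // target): if the costliest select of a group has PredCost = 6 cycles and
+  // NonPredCost = 4 cycles, then BranchCost (4) < SelectCost (6) and the
+  // group is deemed profitable, provided the loop-level checks below pass.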
+
+  DenseMap<const Instruction *, CostInfo> InstCostMap;
+  CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
+                          {Scaled64::getZero(), Scaled64::getZero()}};
+  if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
+      !checkLoopHeuristics(L, LoopCost)) {
+    return;
+  }
+
+  for (SelectGroup &ASI : SIGroups) {
+    // Assuming infinite resources, the cost of a group of instructions is the
+    // cost of the most expensive instruction of the group.
+    Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
+    for (SelectInst *SI : ASI) {
+      SelectCost = std::max(SelectCost, InstCostMap[SI].PredCost);
+      BranchCost = std::max(BranchCost, InstCostMap[SI].NonPredCost);
+    }
+    if (BranchCost < SelectCost) {
+      OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front());
+      OR << "Profitable to convert to branch (loop analysis). BranchCost="
+         << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
+         << ". ";
+      ORE->emit(OR);
+      ++NumSelectConvertedLoop;
+      ProfSIGroups.push_back(ASI);
+    } else {
+      OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
+      ORmiss << "Select is more profitable (loop analysis). BranchCost="
+             << BranchCost.toString()
+             << ", SelectCost=" << SelectCost.toString() << ". ";
+      ORE->emit(ORmiss);
+    }
+  }
+}
+
 bool SelectOptimize::isConvertToBranchProfitableBase(
     const SmallVector<SelectInst *, 2> &ASI) {
   SelectInst *SI = ASI.front();
@@ -529,6 +659,220 @@
   return false;
 }

+bool SelectOptimize::checkLoopHeuristics(const Loop *L,
+                                         const CostInfo LoopCost[2]) {
+  // Loop-level checks to determine if a non-predicated version (with branches)
+  // of the loop is more profitable than its predicated version.
+
+  if (DisableLoopLevelHeuristics)
+    return true;
+
+  OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
+                                   L->getHeader()->getFirstNonPHI());
+
+  if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
+      LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
+    ORmissL << "No select conversion in the loop due to no reduction of loop's "
+               "critical path. ";
+    ORE->emit(ORmissL);
+    return false;
+  }
+
+  Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
+                      LoopCost[1].PredCost - LoopCost[1].NonPredCost};
+
+  // Profitably converting to branches needs to reduce the loop's critical path
+  // by at least some threshold (absolute gain of GainCycleThreshold cycles and
+  // relative gain of 12.5%).
+  if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
+      Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
+    Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
+    ORmissL << "No select conversion in the loop due to small reduction of "
+               "loop's critical path. Gain="
+            << Gain[1].toString()
+            << ", RelativeGain=" << RelativeGain.toString() << "%. ";
+    ORE->emit(ORmissL);
+    return false;
+  }
+
+  // If the loop's critical path involves loop-carried dependences, the gradient
+  // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
+  // This check ensures that the latency reduction for the loop's critical path
+  // keeps increasing at a sufficient rate beyond the two analyzed loop
+  // iterations.
+  if (Gain[1] > Gain[0]) {
+    Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
+                            (LoopCost[1].PredCost - LoopCost[0].PredCost);
+    if (GradientGain < Scaled64::get(GainGradientThreshold)) {
+      ORmissL << "No select conversion in the loop due to small gradient gain. "
+                 "GradientGain="
+              << GradientGain.toString() << "%. ";
"; + ORE->emit(ORmissL); + return false; + } + } + // If the gain decreases it is not profitable to convert. + else if (Gain[1] < Gain[0]) { + ORmissL + << "No select conversion in the loop due to negative gradient gain. "; + ORE->emit(ORmissL); + return false; + } + + // Non-predicated version of the loop is more profitable than its + // predicated version. + return true; +} + +// Computes instruction and loop-critical-path costs for both the predicated +// and non-predicated version of the given loop. +// Returns false if unable to compute these costs due to invalid cost of loop +// instruction(s). +bool SelectOptimize::computeLoopCosts( + const Loop *L, const SelectGroups &SIGroups, + DenseMap &InstCostMap, CostInfo *LoopCost) { + const auto &SIset = getSIset(SIGroups); + // Compute instruction and loop-critical-path costs across two iterations for + // both predicated and non-predicated version. + const unsigned Iterations = 2; + for (unsigned Iter = 0; Iter < Iterations; ++Iter) { + // Cost of the loop's critical path. + CostInfo &MaxCost = LoopCost[Iter]; + for (BasicBlock *BB : L->getBlocks()) { + for (const Instruction &I : *BB) { + if (I.isDebugOrPseudoInst()) + continue; + // Compute the predicated and non-predicated cost of the instruction. + Scaled64 IPredCost = Scaled64::getZero(), + INonPredCost = Scaled64::getZero(); + + // Assume infinite resources that allow to fully exploit the available + // instruction-level parallelism. + // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost) + for (const Use &U : I.operands()) { + auto UI = dyn_cast(U.get()); + if (!UI) + continue; + if (InstCostMap.count(UI)) { + IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost); + INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost); + } + } + auto ILatency = computeInstCost(&I); + if (!ILatency.hasValue()) { + OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I); + ORmissL << "Invalid instruction cost preventing analysis and " + "optimization of the inner-most loop containing this " + "instruction. "; + ORE->emit(ORmissL); + return false; + } + IPredCost += Scaled64::get(ILatency.getValue()); + INonPredCost += Scaled64::get(ILatency.getValue()); + + // For a select that can be converted to branch, + // compute its cost as a branch (non-predicated cost). 
+        //
+        // BranchCost = PredictedPathCost + MispredictCost
+        // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
+        // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
+        if (SIset.contains(&I)) {
+          auto SI = dyn_cast<SelectInst>(&I);
+
+          Scaled64 TrueOpCost = Scaled64::getZero(),
+                   FalseOpCost = Scaled64::getZero();
+          if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue()))
+            if (InstCostMap.count(TI))
+              TrueOpCost = InstCostMap[TI].NonPredCost;
+          if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue()))
+            if (InstCostMap.count(FI))
+              FalseOpCost = InstCostMap[FI].NonPredCost;
+          Scaled64 PredictedPathCost =
+              getPredictedPathCost(TrueOpCost, FalseOpCost, SI);
+
+          Scaled64 CondCost = Scaled64::getZero();
+          if (auto *CI = dyn_cast<Instruction>(SI->getCondition()))
+            if (InstCostMap.count(CI))
+              CondCost = InstCostMap[CI].NonPredCost;
+          Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);
+
+          INonPredCost = PredictedPathCost + MispredictCost;
+        }
+
+        InstCostMap[&I] = {IPredCost, INonPredCost};
+        MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
+        MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
+      }
+    }
+  }
+  return true;
+}
+
+SmallPtrSet<const Instruction *, 2>
+SelectOptimize::getSIset(const SelectGroups &SIGroups) {
+  SmallPtrSet<const Instruction *, 2> SIset;
+  for (const SelectGroup &ASI : SIGroups)
+    for (const SelectInst *SI : ASI)
+      SIset.insert(SI);
+  return SIset;
+}
+
+Optional<uint64_t> SelectOptimize::computeInstCost(const Instruction *I) {
+  InstructionCost ICost =
+      TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+  if (auto OC = ICost.getValue())
+    return Optional<uint64_t>(OC.getValue());
+  return Optional<uint64_t>(None);
+}
+
+ScaledNumber<uint64_t>
+SelectOptimize::getMispredictionCost(const SelectInst *SI,
+                                     const Scaled64 CondCost) {
+  uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
+
+  // Account for the default misprediction rate when using a branch
+  // (conservatively set to 25% by default).
+  uint64_t MispredictRate = MispredictDefaultRate;
+  // If the select condition is obviously predictable, then the misprediction
+  // rate is zero.
+  if (isSelectHighlyPredictable(SI))
+    MispredictRate = 0;
+
+  // CondCost is included to account for cases where the computation of the
+  // condition is part of a long dependence chain (potentially loop-carried)
+  // that would delay detection of a misprediction and increase its cost.
+  Scaled64 MispredictCost =
+      std::max(Scaled64::get(MispredictPenalty), CondCost) *
+      Scaled64::get(MispredictRate);
+  MispredictCost /= Scaled64::get(100);
+
+  return MispredictCost;
+}
+
+// Returns the cost of a branch when the prediction is correct.
+// TrueCost * TrueProbability + FalseCost * FalseProbability.
+ScaledNumber<uint64_t>
+SelectOptimize::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
+                                     const SelectInst *SI) {
+  Scaled64 PredPathCost;
+  uint64_t TrueWeight, FalseWeight;
+  if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
+    uint64_t SumWeight = TrueWeight + FalseWeight;
+    if (SumWeight != 0) {
+      PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
+                     FalseCost * Scaled64::get(FalseWeight);
+      PredPathCost /= Scaled64::get(SumWeight);
+      return PredPathCost;
+    }
+  }
+  // Without branch weight metadata, we assume 75% for one path and 25% for
+  // the other, and pick the result with the biggest cost.
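+  //
+  // For instance, with hypothetical costs TrueCost = 2 and FalseCost = 10,
+  // the expression below evaluates to max(2*3 + 10, 10*3 + 2) / 4 = 32 / 4
+  // = 8, i.e. a 75% weight on the costlier path and 25% on the cheaper one.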
+ PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost, + FalseCost * Scaled64::get(3) + TrueCost); + PredPathCost /= Scaled64::get(4); + return PredPathCost; +} + bool SelectOptimize::isSelectKindSupported(SelectInst *SI) { bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); if (VectorCond) diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll --- a/llvm/test/CodeGen/X86/select-optimize.ll +++ b/llvm/test/CodeGen/X86/select-optimize.ll @@ -211,9 +211,301 @@ ret i32 %add } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Test loop heuristic: loop-level critical-path analysis +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Use of cmov in this test would put a load and a fsub on the critical path. +;; Loop-level analysis should decide to form a branch. +;; +;;double cmov_on_critical_path(int n, double x, double *a) { +;; for (int i = 0; i < n; i++) { +;; double r = a[i]; +;; if (x > r) +;; // 50% of iterations +;; x -= r; +;; } +;; return x; +;;} +define double @cmov_on_critical_path(i32 %n, double %x, double* nocapture %a) { +; CHECK-LABEL: @cmov_on_critical_path( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret double [[X:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[SELECT_END:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[X1:%.*]] = phi double [ [[X2:%.*]], [[SELECT_END]] ], [ [[X]], [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[R:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[SUB:%.*]] = fsub double [[X1]], [[R]] +; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt double [[X1]], [[R]] +; CHECK-NEXT: [[X2_FROZEN:%.*]] = freeze i1 [[CMP2]] +; CHECK-NEXT: br i1 [[X2_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]], !prof [[PROF27:![0-9]+]] +; CHECK: select.false: +; CHECK-NEXT: br label [[SELECT_END]] +; CHECK: select.end: +; CHECK-NEXT: [[X2]] = phi double [ [[SUB]], [[FOR_BODY]] ], [ [[X1]], [[SELECT_FALSE]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]] +; CHECK: for.exit: +; CHECK-NEXT: ret double [[X2]] +; +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %entry + ret double %x + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %x1 = phi double [ %x2, %for.body ], [ %x, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv + %r = load double, double* %arrayidx, align 8 + %sub = fsub double %x1, %r + %cmp2 = fcmp ogt double %x1, %r + %x2 = select i1 %cmp2, double %sub, double %x1, !prof !18 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 
%indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret double %x2 +} + +;; The common path includes expensive operations (load and fsub) making +;; branch similarly expensive to cmov, and thus the gain is small. +;; Loop-level analysis should decide on not forming a branch. +;; +;;double small_gain(int n, double x, double *a) { +;; for (int i = 0; i < n; i++) { +;; double r = a[i]; +;; if (x > r) +;; // 99% of iterations +;; x -= r; +;; } +;; return x; +;;} +define double @small_gain(i32 %n, double %x, double* nocapture %a) { +; CHECK-LABEL: @small_gain( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret double [[X:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[X1:%.*]] = phi double [ [[X2:%.*]], [[FOR_BODY]] ], [ [[X]], [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[R:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[SUB:%.*]] = fsub double [[X1]], [[R]] +; CHECK-NEXT: [[CMP2:%.*]] = fcmp ole double [[X1]], [[R]] +; CHECK-NEXT: [[X2]] = select i1 [[CMP2]], double [[X1]], double [[SUB]], !prof [[PROF18]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]] +; CHECK: for.exit: +; CHECK-NEXT: ret double [[X2]] +; +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %entry + ret double %x + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %x1 = phi double [ %x2, %for.body ], [ %x, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv + %r = load double, double* %arrayidx, align 8 + %sub = fsub double %x1, %r + %cmp2 = fcmp ole double %x1, %r + %x2 = select i1 %cmp2, double %x1, double %sub, !prof !17 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret double %x2 +} + +;; Use of a branch in this test would avoid executing a load and several +;; floating-point operations for most cases (70% of the time). +;; Yet, the gain is not increasing much per iteration (small gradient gain). +;; Loop-level analysis should decide not to form a branch. 
+;; +;;double small_gradient(int n, double x, double *a) { +;; for (int i = 0; i < n; i++) { +;; double r = 2 * a[i] + i; +;; if (r > 0) +;; // 30% of iterations +;; x -= r; +;; } +;; return x; +;;} +define double @small_gradient(i32 %n, double %x, double* nocapture %a) { +; CHECK-LABEL: @small_gradient( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[X_ADDR_0_LCSSA:%.*]] = phi double [ [[X:%.*]], [[ENTRY:%.*]] ], [ [[X_ADDR_1:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: ret double [[X_ADDR_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[X_ADDR_010:%.*]] = phi double [ [[X]], [[FOR_BODY_PREHEADER]] ], [ [[X_ADDR_1]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.fmuladd.f64(double [[TMP0]], double 2.000000e+00, double 1.000000e+00) +; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt double [[TMP1]], 0.000000e+00 +; CHECK-NEXT: [[SUB:%.*]] = select i1 [[CMP1]], double [[TMP1]], double 0.000000e+00, !prof [[PROF28:![0-9]+]] +; CHECK-NEXT: [[X_ADDR_1]] = fsub double [[X_ADDR_010]], [[SUB]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; +entry: + %cmp8 = icmp sgt i32 %n, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %x.addr.0.lcssa = phi double [ %x, %entry ], [ %x.addr.1, %for.body ] + ret double %x.addr.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %x.addr.010 = phi double [ %x, %for.body.preheader ], [ %x.addr.1, %for.body ] + %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv + %0 = load double, double* %arrayidx, align 8 + %1 = call double @llvm.fmuladd.f64(double %0, double 2.000000e+00, double 1.000000e+00) + %cmp1 = fcmp ogt double %1, 0.000000e+00 + %sub = select i1 %cmp1, double %1, double 0.000000e+00, !prof !28 + %x.addr.1 = fsub double %x.addr.010, %sub + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +;; One select on the critical path and one off the critical path. +;; Loop-level analysis should decide to form a branch only for +;; the select on the critical path. 
+;; +;;double loop_select_groups(int n, double x, double *a, int k) { +;; int c = 0; +;; for (int i = 0; i < n; i++) { +;; double r = a[i]; +;; if (x > r) +;; x -= r; +;; if (i == k) +;; c += n; +;; } +;; return x + c; +;;} +define double @loop_select_groups(i32 %n, double %x, double* nocapture %a, i32 %k) { +; CHECK-LABEL: @loop_select_groups( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: [[PHI_CAST:%.*]] = sitofp i32 [[C_1:%.*]] to double +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[C_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[PHI_CAST]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[X_ADDR_0_LCSSA:%.*]] = phi double [ [[X:%.*]], [[ENTRY]] ], [ [[X_ADDR_1:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: [[ADD5:%.*]] = fadd double [[X_ADDR_0_LCSSA]], [[C_0_LCSSA]] +; CHECK-NEXT: ret double [[ADD5]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[SELECT_END:%.*]] ] +; CHECK-NEXT: [[X_ADDR_022:%.*]] = phi double [ [[X]], [[FOR_BODY_PREHEADER]] ], [ [[X_ADDR_1]], [[SELECT_END]] ] +; CHECK-NEXT: [[C_020:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[C_1]], [[SELECT_END]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt double [[X_ADDR_022]], [[TMP0]] +; CHECK-NEXT: [[SUB_FROZEN:%.*]] = freeze i1 [[CMP1]] +; CHECK-NEXT: br i1 [[SUB_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]] +; CHECK: select.false: +; CHECK-NEXT: br label [[SELECT_END]] +; CHECK: select.end: +; CHECK-NEXT: [[SUB:%.*]] = phi double [ [[TMP0]], [[FOR_BODY]] ], [ 0.000000e+00, [[SELECT_FALSE]] ] +; CHECK-NEXT: [[X_ADDR_1]] = fsub double [[X_ADDR_022]], [[SUB]] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[K:%.*]], [[N]] +; CHECK-NEXT: [[ADD:%.*]] = select i1 [[CMP2]], i32 [[N]], i32 0 +; CHECK-NEXT: [[C_1]] = add nsw i32 [[ADD]], [[C_020]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] +; +entry: + %cmp19 = icmp sgt i32 %n, 0 + br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %phi.cast = sitofp i32 %c.1 to double + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %c.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %phi.cast, %for.cond.cleanup.loopexit ] + %x.addr.0.lcssa = phi double [ %x, %entry ], [ %x.addr.1, %for.cond.cleanup.loopexit ] + %add5 = fadd double %x.addr.0.lcssa, %c.0.lcssa + ret double %add5 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %x.addr.022 = phi double [ %x, %for.body.preheader ], [ 
%x.addr.1, %for.body ] + %c.020 = phi i32 [ 0, %for.body.preheader ], [ %c.1, %for.body ] + %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv + %0 = load double, double* %arrayidx, align 8 + %cmp1 = fcmp ogt double %x.addr.022, %0 + %sub = select i1 %cmp1, double %0, double 0.000000e+00 + %x.addr.1 = fsub double %x.addr.022, %sub + %1 = trunc i64 %indvars.iv to i32 + %cmp2 = icmp eq i32 %k, %n + %add = select i1 %cmp2, i32 %n, i32 0 + %c.1 = add nsw i32 %add, %c.020 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} + ; Function Attrs: nounwind readnone speculatable willreturn declare void @llvm.dbg.value(metadata, metadata, metadata) +; Function Attrs: mustprogress nofree nosync nounwind readnone speculatable willreturn +declare double @llvm.fmuladd.f64(double, double, double) + !llvm.module.flags = !{!0, !26, !27} !0 = !{i32 1, !"ProfileSummary", !1} !1 = !{!2, !3, !4, !5, !6, !7, !8, !9} @@ -243,3 +535,4 @@ !25 = !{} !26 = !{i32 2, !"Dwarf Version", i32 4} !27 = !{i32 1, !"Debug Info Version", i32 3} +!28 = !{!"branch_weights", i32 30, i32 70}