diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -15,37 +15,73 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/SizeOpts.h" +#include +#include +#include +#include +#include using namespace llvm; #define DEBUG_TYPE "select-optimize" +STATISTIC(NumSelectOptAnalyzed, + "Number of select groups considered for conversion to branch"); +STATISTIC(NumSelectConvertedExpColdOperand, + "Number of select groups converted due to expensive cold operand"); +STATISTIC(NumSelectConvertedHighPred, + "Number of select groups converted due to high-predictability"); +STATISTIC(NumSelectUnPred, + "Number of select groups not converted due to unpredictability"); +STATISTIC(NumSelectColdBB, + "Number of select groups not converted due to cold basic block"); STATISTIC(NumSelectsConverted, "Number of selects converted"); +static cl::opt ColdOperandThreshold( + "cold-operand-threshold", + cl::desc("Maximum frequency of path for an operand to be considered cold."), + cl::init(20), cl::Hidden); + +static cl::opt ColdOperandMaxCostMultiplier( + "cold-operand-max-cost-multiplier", + cl::desc("Maximum cost multiplier of TCC_expensive for the dependence " + "slice of a cold operand to be considered inexpensive."), + cl::init(1), cl::Hidden); + namespace { class SelectOptimize : public FunctionPass { const TargetMachine *TM = nullptr; const TargetSubtargetInfo *TSI; const TargetLowering *TLI = nullptr; + const TargetTransformInfo *TTI = nullptr; const LoopInfo *LI; + DominatorTree *DT; std::unique_ptr BFI; std::unique_ptr BPI; + ProfileSummaryInfo *PSI; + OptimizationRemarkEmitter *ORE; public: static char ID; + SelectOptimize() : FunctionPass(ID) { initializeSelectOptimizePass(*PassRegistry::getPassRegistry()); } @@ -53,8 +89,12 @@ bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.addRequired(); + AU.addRequired(); + AU.addRequired(); AU.addRequired(); + AU.addRequired(); } private: @@ -63,9 +103,47 @@ using SelectGroup = SmallVector; using SelectGroups = SmallVector; + // Converts select instructions of a function to conditional jumps when deemed + // profitable. Returns true if at least one select was converted. bool optimizeSelects(Function &F); + + // Heuristics for determining which select instructions can be profitably + // conveted to branches. Separate heuristics for selects in inner-most loops + // and the rest of code regions (base heuristics for non-inner-most loop + // regions). + void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups); + void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups); + + // Converts to branches the select groups that were deemed + // profitable-to-convert. void convertProfitableSIGroups(SelectGroups &ProfSIGroups); + + // Splits selects of a given basic block into select groups. void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups); + + // Determines for which select groups it is profitable converting to branches + // (base heuristics). + void findProfitableSIGroupsBase(SelectGroups &SIGroups, + SelectGroups &ProfSIGroups); + // Determines if a select group should be converted to a branch (base + // heuristics). + bool isConvertToBranchProfitableBase(const SmallVector &ASI); + + // Returns true if there are expensive instructions in the cold value + // operand's (if any) dependence slice of any of the selects of the given + // group. + bool hasExpensiveColdOperand(const SmallVector &ASI); + + // For a given source instruction, collect its backwards dependence slice + // consisting of instructions exclusively computed for producing the operands + // of the source instruction. + void getExclBackwardsSlice(Instruction *I, + SmallVector &Slice); + + // Returns true if the condition of the select is highly predictable. + bool isSelectHighlyPredictable(const SelectInst *SI); + + // Returns true if the target architecture supports lowering a given select. bool isSelectKindSupported(SelectInst *SI); }; } // namespace @@ -75,7 +153,11 @@ INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false, false) @@ -85,27 +167,37 @@ TM = &getAnalysis().getTM(); TSI = TM->getSubtargetImpl(F); TLI = TSI->getTargetLowering(); + + // If none of the select types is supported then skip this pass. + // This is an optimization pass. Legality issues will be handled by + // instruction selection. + if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) && + !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) && + !TLI->isSelectSupported(TargetLowering::VectorMaskSelect)) + return false; + + TTI = &getAnalysis().getTTI(F); + DT = &getAnalysis().getDomTree(); LI = &getAnalysis().getLoopInfo(); BPI.reset(new BranchProbabilityInfo(F, *LI)); BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI)); + PSI = &getAnalysis().getPSI(); + ORE = &getAnalysis().getORE(); + + // When optimizing for size, selects are preferable over branches. + if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI.get())) + return false; return optimizeSelects(F); } bool SelectOptimize::optimizeSelects(Function &F) { - // Collect all the select groups. - SelectGroups SIGroups; - for (BasicBlock &BB : F) { - collectSelectGroups(BB, SIGroups); - } - // Determine for which select groups it is profitable converting to branches. SelectGroups ProfSIGroups; - // For now assume that all select groups can be profitably converted to - // branches. - for (SelectGroup &ASI : SIGroups) { - ProfSIGroups.push_back(ASI); - } + // Base heuristics apply only to non-loops and outer loops. + optimizeSelectsBase(F, ProfSIGroups); + // Separate heuristics for inner-most loops. + optimizeSelectsInnerLoops(F, ProfSIGroups); // Convert to branches the select groups that were deemed // profitable-to-convert. @@ -115,6 +207,25 @@ return !ProfSIGroups.empty(); } +void SelectOptimize::optimizeSelectsBase(Function &F, + SelectGroups &ProfSIGroups) { + // Collect all the select groups. + SelectGroups SIGroups; + for (BasicBlock &BB : F) { + // Base heuristics apply only to non-loops and outer loops. + Loop *L = LI->getLoopFor(&BB); + if (L && L->isInnermost()) + continue; + collectSelectGroups(BB, SIGroups); + } + + // Determine for which select groups it is profitable converting to branches. + findProfitableSIGroupsBase(SIGroups, ProfSIGroups); +} + +void SelectOptimize::optimizeSelectsInnerLoops(Function &F, + SelectGroups &ProfSIGroups) {} + /// If \p isTrue is true, return the true value of \p SI, otherwise return /// false value of \p SI. If the true/false value of \p SI is defined by any /// select instructions in \p Selects, look through the defining select @@ -256,6 +367,168 @@ } } +void SelectOptimize::findProfitableSIGroupsBase(SelectGroups &SIGroups, + SelectGroups &ProfSIGroups) { + for (SelectGroup &ASI : SIGroups) { + ++NumSelectOptAnalyzed; + if (isConvertToBranchProfitableBase(ASI)) + ProfSIGroups.push_back(ASI); + } +} + +bool SelectOptimize::isConvertToBranchProfitableBase( + const SmallVector &ASI) { + SelectInst *SI = ASI.front(); + OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI); + OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI); + + // Skip cold basic blocks. Better to optimize for size for cold blocks. + if (PSI->isColdBlock(SI->getParent(), BFI.get())) { + ++NumSelectColdBB; + ORmiss << "Not converted to branch because of cold basic block. "; + ORE->emit(ORmiss); + return false; + } + + // If unpredictable, branch form is less profitable. + if (SI->getMetadata(LLVMContext::MD_unpredictable)) { + ++NumSelectUnPred; + ORmiss << "Not converted to branch because of unpredictable branch. "; + ORE->emit(ORmiss); + return false; + } + + // If highly predictable, branch form is more profitable, unless a + // predictable select is inexpensive in the target architecture. + if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) { + ++NumSelectConvertedHighPred; + OR << "Converted to branch because of highly predictable branch. "; + ORE->emit(OR); + return true; + } + + // Look for expensive instructions in the cold operand's (if any) dependence + // slice of any of the selects in the group. + if (hasExpensiveColdOperand(ASI)) { + ++NumSelectConvertedExpColdOperand; + OR << "Converted to branch because of expensive cold operand."; + ORE->emit(OR); + return true; + } + + ORmiss << "Not profitable to convert to branch (base heuristic)."; + ORE->emit(ORmiss); + return false; +} + +static InstructionCost divideNearest(InstructionCost Numerator, + uint64_t Denominator) { + return (Numerator + (Denominator / 2)) / Denominator; +} + +bool SelectOptimize::hasExpensiveColdOperand( + const SmallVector &ASI) { + bool ColdOperand = false; + uint64_t TrueWeight, FalseWeight, TotalWeight; + if (ASI.front()->extractProfMetadata(TrueWeight, FalseWeight)) { + uint64_t MinWeight = std::min(TrueWeight, FalseWeight); + TotalWeight = TrueWeight + FalseWeight; + // Is there a path with frequency 100 * MinWeight; + } else if (PSI->hasProfileSummary()) { + OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front()); + ORmiss << "Profile data available but missing branch-weights metadata for " + "select instruction. "; + ORE->emit(ORmiss); + } + if (!ColdOperand) + return false; + // Check if the cold path's dependence slice is expensive for any of the + // selects of the group. + for (SelectInst *SI : ASI) { + Instruction *ColdI = nullptr; + uint64_t HotWeight; + if (TrueWeight < FalseWeight) { + ColdI = dyn_cast(SI->getTrueValue()); + HotWeight = FalseWeight; + } else { + ColdI = dyn_cast(SI->getFalseValue()); + HotWeight = TrueWeight; + } + if (ColdI) { + SmallVector ColdSlice; + getExclBackwardsSlice(ColdI, ColdSlice); + InstructionCost SliceCost = 0; + for (auto *ColdII : ColdSlice) { + SliceCost += + TTI->getInstructionCost(ColdII, TargetTransformInfo::TCK_Latency); + } + // The colder the cold value operand of the select is the more expensive + // the cmov becomes for computing the cold value operand every time. Thus, + // the colder the cold operand is the more its cost counts. + // Get nearest integer cost adjusted for coldness. + InstructionCost AdjSliceCost = + divideNearest(SliceCost * HotWeight, TotalWeight); + if (AdjSliceCost >= + ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive) + return true; + } + } + return false; +} + +// For a given source instruction, collect its backwards dependence slice +// consisting of instructions exclusively computed for the purpose of producing +// the operands of the source instruction. As an approximation +// (sufficiently-accurate in practice), we populate this set with the +// instructions of the backwards dependence slice that only have one-use and +// form an one-use chain that leads to the source instruction. +void SelectOptimize::getExclBackwardsSlice( + Instruction *I, SmallVector &Slice) { + SmallPtrSet Visited; + std::queue Worklist; + Worklist.push(I); + while (!Worklist.empty()) { + Instruction *II = Worklist.front(); + Worklist.pop(); + + // Avoid cycles. + if (Visited.count(II)) + continue; + Visited.insert(II); + + if (!II->hasOneUse()) + continue; + + // Avoid considering instructions with less frequency than the source + // instruction (i.e., avoid colder code regions of the dependence slice). + if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent())) + continue; + + // Eligible one-use instruction added to the dependence slice. + Slice.push_back(II); + + // Explore all the operands of the current instruction to expand the slice. + for (unsigned k = 0; k < II->getNumOperands(); ++k) + if (auto *OpI = dyn_cast(II->getOperand(k))) + Worklist.push(OpI); + } +} + +bool SelectOptimize::isSelectHighlyPredictable(const SelectInst *SI) { + uint64_t TrueWeight, FalseWeight; + if (SI->extractProfMetadata(TrueWeight, FalseWeight)) { + uint64_t Max = std::max(TrueWeight, FalseWeight); + uint64_t Sum = TrueWeight + FalseWeight; + if (Sum != 0) { + auto Probability = BranchProbability::getBranchProbability(Max, Sum); + if (Probability > TTI->getPredictableBranchThreshold()) + return true; + } + } + return false; +} + bool SelectOptimize::isSelectKindSupported(SelectInst *SI) { bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); if (VectorCond) diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll --- a/llvm/test/CodeGen/X86/select-optimize.ll +++ b/llvm/test/CodeGen/X86/select-optimize.ll @@ -1,47 +1,133 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple=x86_64-unknown-unknown -select-optimize -S < %s | FileCheck %s -; Single select converted to branch -define i32 @single_select(i32 %a, i32 %b, i1 %cmp) { -; CHECK-LABEL: @single_select( +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Test base heuristic 1: +;; highly-biased selects assumed to be highly predictable, converted to branches +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; If a select is obviously predictable, turn it into a branch. +define i32 @weighted_select1(i32 %a, i32 %b, i1 %cmp) { +; CHECK-LABEL: @weighted_select1( ; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] -; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF2:![0-9]+]] +; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF16:![0-9]+]] ; CHECK: select.false: ; CHECK-NEXT: br label [[SELECT_END]] ; CHECK: select.end: ; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[A:%.*]], [[TMP0:%.*]] ], [ [[B:%.*]], [[SELECT_FALSE]] ] ; CHECK-NEXT: ret i32 [[SEL]] ; - %sel = select i1 %cmp, i32 %a, i32 %b, !prof !0 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15 + ret i32 %sel +} + +; If a select is obviously predictable (reversed profile weights), +; turn it into a branch. +define i32 @weighted_select2(i32 %a, i32 %b, i1 %cmp) { +; CHECK-LABEL: @weighted_select2( +; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] +; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF17:![0-9]+]] +; CHECK: select.false: +; CHECK-NEXT: br label [[SELECT_END]] +; CHECK: select.end: +; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[A:%.*]], [[TMP0:%.*]] ], [ [[B:%.*]], [[SELECT_FALSE]] ] +; CHECK-NEXT: ret i32 [[SEL]] +; + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16 + ret i32 %sel +} + +; Not obvioulsy predictable select. +define i32 @weighted_select3(i32 %a, i32 %b, i1 %cmp) { +; CHECK-LABEL: @weighted_select3( +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], !prof [[PROF18:![0-9]+]] +; CHECK-NEXT: ret i32 [[SEL]] +; + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !17 + ret i32 %sel +} + +; Unpredictable select should not form a branch. +define i32 @unpred_select(i32 %a, i32 %b, i1 %cmp) { +; CHECK-LABEL: @unpred_select( +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], !unpredictable !19 +; CHECK-NEXT: ret i32 [[SEL]] +; + %sel = select i1 %cmp, i32 %a, i32 %b, !unpredictable !20 + ret i32 %sel +} + +; Predictable select in function with optsize attribute should not form branch. +define i32 @weighted_select_optsize(i32 %a, i32 %b, i1 %cmp) optsize { +; CHECK-LABEL: @weighted_select_optsize( +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], !prof [[PROF16]] +; CHECK-NEXT: ret i32 [[SEL]] +; + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15 ret i32 %sel } -; Select group converted to branch -define i32 @select_group(i32 %a, i32 %b, i32 %c, i1 %cmp) { -; CHECK-LABEL: @select_group( +define i32 @weighted_select_pgso(i32 %a, i32 %b, i1 %cmp) !prof !14 { +; CHECK-LABEL: @weighted_select_pgso( +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], !prof [[PROF16]] +; CHECK-NEXT: ret i32 [[SEL]] +; + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15 + ret i32 %sel +} + +; If two selects in a row are predictable, turn them into branches. +define i32 @weighted_selects(i32 %a, i32 %b) !prof !19 { +; CHECK-LABEL: @weighted_selects( +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A:%.*]], 0 +; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP]] +; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF16]] +; CHECK: select.false: +; CHECK-NEXT: br label [[SELECT_END]] +; CHECK: select.end: +; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[A]], [[TMP0:%.*]] ], [ [[B:%.*]], [[SELECT_FALSE]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[SEL]], 0 +; CHECK-NEXT: [[SEL1_FROZEN:%.*]] = freeze i1 [[CMP1]] +; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_END1:%.*]], label [[SELECT_FALSE2:%.*]], !prof [[PROF16]] +; CHECK: select.false2: +; CHECK-NEXT: br label [[SELECT_END1]] +; CHECK: select.end1: +; CHECK-NEXT: [[SEL1:%.*]] = phi i32 [ [[B]], [[SELECT_END]] ], [ [[A]], [[SELECT_FALSE2]] ] +; CHECK-NEXT: ret i32 [[SEL1]] +; + %cmp = icmp ne i32 %a, 0 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15 + %cmp1 = icmp ne i32 %sel, 0 + %sel1 = select i1 %cmp1, i32 %b, i32 %a, !prof !15 + ret i32 %sel1 +} + +; If select group predictable, turn it into a branch. +define i32 @weighted_select_group(i32 %a, i32 %b, i32 %c, i1 %cmp) !prof !19 { +; CHECK-LABEL: @weighted_select_group( ; CHECK-NEXT: [[SEL1_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] -; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF16]] ; CHECK: select.false: ; CHECK-NEXT: br label [[SELECT_END]] ; CHECK: select.end: ; CHECK-NEXT: [[SEL1:%.*]] = phi i32 [ [[A:%.*]], [[TMP0:%.*]] ], [ [[B:%.*]], [[SELECT_FALSE]] ] ; CHECK-NEXT: [[SEL2:%.*]] = phi i32 [ [[C:%.*]], [[TMP0]] ], [ [[A]], [[SELECT_FALSE]] ] -; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[SEL1]], metadata [[META3:![0-9]+]], metadata !DIExpression()), !dbg [[DBG8:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[SEL1]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26:![0-9]+]] ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEL1]], [[SEL2]] ; CHECK-NEXT: ret i32 [[ADD]] ; - %sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !0 - call void @llvm.dbg.value(metadata i32 %sel1, metadata !4, metadata !DIExpression()), !dbg !DILocation(scope: !3) - %sel2 = select i1 %cmp, i32 %c, i32 %a, !prof !0 + %sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !15 + call void @llvm.dbg.value(metadata i32 %sel1, metadata !24, metadata !DIExpression()), !dbg !DILocation(scope: !23) + %sel2 = select i1 %cmp, i32 %c, i32 %a, !prof !15 %add = add i32 %sel1, %sel2 ret i32 %add } -; Select group with intra-group dependence converted to branch +; Predictable select group with intra-group dependence converted to branch define i32 @select_group_intra_group(i32 %a, i32 %b, i32 %c, i1 %cmp) { ; CHECK-LABEL: @select_group_intra_group( ; CHECK-NEXT: [[SEL1_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] -; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF16]] ; CHECK: select.false: ; CHECK-NEXT: br label [[SELECT_END]] ; CHECK: select.end: @@ -50,22 +136,110 @@ ; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[SEL1]], [[SEL2]] ; CHECK-NEXT: ret i32 [[SUB]] ; - %sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !0 - %sel2 = select i1 %cmp, i32 %c, i32 %sel1, !prof !0 + %sel1 = select i1 %cmp, i32 %a, i32 %b,!prof !15 + %sel2 = select i1 %cmp, i32 %c, i32 %sel1, !prof !15 %sub = sub i32 %sel1, %sel2 ret i32 %sub } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Test base heuristic 2: +;; look for expensive instructions in the one-use slice of the cold path +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Select with cold one-use load value operand should form branch and +; sink load +define i32 @expensive_val_operand1(i32* nocapture %a, i32 %y, i1 %cmp) { +; CHECK-LABEL: @expensive_val_operand1( +; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8 +; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] +; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF18]] +; CHECK: select.false: +; CHECK-NEXT: br label [[SELECT_END]] +; CHECK: select.end: +; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[LOAD]], [[TMP0:%.*]] ], [ [[Y:%.*]], [[SELECT_FALSE]] ] +; CHECK-NEXT: ret i32 [[SEL]] +; + %load = load i32, i32* %a, align 8 + %sel = select i1 %cmp, i32 %load, i32 %y, !prof !17 + ret i32 %sel +} + +; Expensive hot value operand and cheap cold value operand. +define i32 @expensive_val_operand2(i32* nocapture %a, i32 %x, i1 %cmp) { +; CHECK-LABEL: @expensive_val_operand2( +; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[X:%.*]], i32 [[LOAD]], !prof [[PROF18]] +; CHECK-NEXT: ret i32 [[SEL]] +; + %load = load i32, i32* %a, align 8 + %sel = select i1 %cmp, i32 %x, i32 %load, !prof !17 + ret i32 %sel +} + +; Cold value operand with load in its one-use dependence slice shoud result +; into a branch with sinked dependence slice. +define i32 @expensive_val_operand3(i32* nocapture %a, i32 %b, i32 %y, i1 %cmp) { +; CHECK-LABEL: @expensive_val_operand3( +; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8 +; CHECK-NEXT: [[X:%.*]] = add i32 [[LOAD]], [[B:%.*]] +; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] +; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF18]] +; CHECK: select.false: +; CHECK-NEXT: br label [[SELECT_END]] +; CHECK: select.end: +; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[TMP0:%.*]] ], [ [[Y:%.*]], [[SELECT_FALSE]] ] +; CHECK-NEXT: ret i32 [[SEL]] +; + %load = load i32, i32* %a, align 8 + %x = add i32 %load, %b + %sel = select i1 %cmp, i32 %x, i32 %y, !prof !17 + ret i32 %sel +} + +; Multiple uses of the load value operand. +define i32 @expensive_val_operand4(i32 %a, i32* nocapture %b, i32 %x, i1 %cmp) { +; CHECK-LABEL: @expensive_val_operand4( +; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[B:%.*]], align 4 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[X:%.*]], i32 [[LOAD]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEL]], [[LOAD]] +; CHECK-NEXT: ret i32 [[ADD]] +; + %load = load i32, i32* %b, align 4 + %sel = select i1 %cmp, i32 %x, i32 %load + %add = add i32 %sel, %load + ret i32 %add +} + ; Function Attrs: nounwind readnone speculatable willreturn declare void @llvm.dbg.value(metadata, metadata, metadata) -!llvm.module.flags = !{!6, !7} - -!0 = !{!"branch_weights", i32 1, i32 100} -!1 = !DIFile(filename: "test.c", directory: "/test") -!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 15.0.0", isOptimized: true, emissionKind: FullDebug, globals: !5, splitDebugInlining: false, nameTableKind: None) -!3 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, unit: !2) -!4 = !DILocalVariable(name: "x", scope: !3) -!5 = !{} -!6 = !{i32 2, !"Dwarf Version", i32 4} -!7 = !{i32 1, !"Debug Info Version", i32 3} +!llvm.module.flags = !{!0, !26, !27} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} +!15 = !{!"branch_weights", i32 1, i32 100} +!16 = !{!"branch_weights", i32 100, i32 1} +!17 = !{!"branch_weights", i32 1, i32 99} +!18 = !{!"branch_weights", i32 50, i32 50} +!19 = !{!"function_entry_count", i64 100} +!20 = !{} +!21 = !DIFile(filename: "test.c", directory: "/test") +!22 = distinct !DICompileUnit(language: DW_LANG_C99, file: !21, producer: "clang version 15.0.0", isOptimized: true, emissionKind: FullDebug, globals: !25, splitDebugInlining: false, nameTableKind: None) +!23 = distinct !DISubprogram(name: "test", scope: !21, file: !21, line: 1, unit: !22) +!24 = !DILocalVariable(name: "x", scope: !23) +!25 = !{} +!26 = !{i32 2, !"Dwarf Version", i32 4} +!27 = !{i32 1, !"Debug Info Version", i32 3}