Index: llvm/lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -177,6 +177,14 @@ cl::init(2), cl::Hidden); +// Heuristic for tail duplication if profile count is used in cost model. +static cl::opt TailDupProfileCountThreshold( + "tail-dup-profile-count-threshold", + cl::desc("If profile count information is used in tail duplication cost " + "model, the gained fall through number from tail duplication " + "should be at least this percent of hot count."), + cl::init(50), cl::Hidden); + // Heuristic for triangle chains. static cl::opt TriangleChainCount( "triangle-chain-count", @@ -377,6 +385,10 @@ /// Partial tail duplication threshold. BlockFrequency DupThreshold; + /// True: use block profile count to compute tail duplication cost. + /// False: use block frequency to compute tail duplication cost. + bool UseProfileCount; + /// Allocator and owner of BlockChain structures. /// /// We build BlockChains lazily while processing the loop structure of @@ -402,6 +414,19 @@ SmallPtrSet BlocksWithUnanalyzableExits; #endif + /// Get block profile count or frequency according to UseProfileCount. + /// The return value is used to model tail duplication cost. + BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) { + if (UseProfileCount) { + auto Count = MBFI->getMBFI().getBlockProfileCount(BB); + if (Count) + return *Count; + else + return 0; + } else + return MBFI->getBlockFreq(BB); + } + /// Scale the DupThreshold according to basic block size. BlockFrequency scaleThreshold(MachineBasicBlock *BB); void initDupThreshold(); @@ -3120,7 +3145,7 @@ // Compute the number of reduced taken branches if Pred falls through to BB // instead of another successor. Then compare it with threshold. - BlockFrequency PredFreq = MBFI->getBlockFreq(Pred); + BlockFrequency PredFreq = getBlockCountOrFrequency(Pred); BlockFrequency Gain = PredFreq * (BBProb - BestProb); return Gain > scaleThreshold(BB); } @@ -3134,8 +3159,17 @@ MachineBasicBlock *Fallthrough = nullptr; BranchProbability DefaultBranchProb = BranchProbability::getZero(); BlockFrequency BBDupThreshold(scaleThreshold(BB)); + SmallVector Preds(BB->pred_begin(), BB->pred_end()); - SmallVector Succs(BB->succ_begin(), BB->succ_end()); + SmallVector Succs; + for (MachineBasicBlock *Succ : BB->successors()) { + if (BlockFilter && !BlockFilter->count(Succ)) + continue; + BlockChain *SuccChain = BlockToChain[Succ]; + if (SuccChain && (Succ != *SuccChain->begin())) + continue; + Succs.push_back(Succ); + } // Sort for highest frequency. auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) { @@ -3194,7 +3228,7 @@ // it. But it can beneficially fall through to BB, and duplicate BB into other // predecessors. for (MachineBasicBlock *Pred : Preds) { - BlockFrequency PredFreq = MBFI->getBlockFreq(Pred); + BlockFrequency PredFreq = getBlockCountOrFrequency(Pred); if (!TailDup.canTailDuplicate(BB, Pred)) { // BB can't be duplicated into Pred, but it is possible to be layout @@ -3243,6 +3277,15 @@ if (!F->getFunction().hasProfileData()) return; + // We prefer to use prifile count. + uint64_t HotThreshold = PSI->getOrCompHotCountThreshold(); + if (HotThreshold != UINT64_MAX) { + UseProfileCount = true; + DupThreshold = HotThreshold * TailDupProfileCountThreshold / 100; + return; + } + + // Profile count is not available, we can use block frequency instead. BlockFrequency MaxFreq = 0; for (MachineBasicBlock &MBB : *F) { BlockFrequency Freq = MBFI->getBlockFreq(&MBB); @@ -3250,10 +3293,9 @@ MaxFreq = Freq; } - // FIXME: we may use profile count instead of frequency, - // and need more fine tuning. BranchProbability ThresholdProb(TailDupPlacementPenalty, 100); DupThreshold = MaxFreq * ThresholdProb; + UseProfileCount = false; } bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { Index: llvm/test/CodeGen/X86/dup-cost.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/dup-cost.ll @@ -0,0 +1,105 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +; Cold function, %dup should not be duplicated into predecessors. +define i32 @cold(i32 %a, i32* %p, i32* %q) !prof !21 { +; CHECK-LABEL: cold +; CHECK: %entry +; CHECK: %true1 +; CHECK: %dup +; CHECK: %true2 +; CHECK: %false1 +; CHECK: %false2 +entry: + %cond1 = icmp sgt i32 %a, 1 + br i1 %cond1, label %true1, label %false1, !prof !30 + +true1: + %v1 = load i32, i32* %p, align 4 + %v2 = add i32 %v1, 2 + br label %dup + +false1: + %v3 = load i32, i32* %q, align 4 + %v4 = sub i32 %v3, 3 + br label %dup + +dup: + %v5 = phi i32 [%v2, %true1], [%v4, %false1] + %cond2 = icmp sgt i32 %v5, 4 + br i1 %cond2, label %true2, label %false2, !prof !30 + +true2: + %v6 = xor i32 %v5, %a + br label %exit + +false2: + %v7 = and i32 %v5, %a + br label %exit + +exit: + %v8 = phi i32 [%v6, %true2], [%v7, %false2] + ret i32 %v8 +} + +; Same code as previous function, but with hot profile count. +; So %dup should be duplicated into predecessors. +define i32 @hot(i32 %a, i32* %p, i32* %q) !prof !22 { +; CHECK-LABEL: hot +; CHECK: %entry +; CHECK: %true1 +; CHECK: %false2 +; CHECK: %false1 +; CHECK: %true2 +entry: + %cond1 = icmp sgt i32 %a, 1 + br i1 %cond1, label %true1, label %false1, !prof !30 + +true1: + %v1 = load i32, i32* %p, align 4 + %v2 = add i32 %v1, 2 + br label %dup + +false1: + %v3 = load i32, i32* %q, align 4 + %v4 = sub i32 %v3, 3 + br label %dup + +dup: + %v5 = phi i32 [%v2, %true1], [%v4, %false1] + %cond2 = icmp sgt i32 %v5, 4 + br i1 %cond2, label %true2, label %false2, !prof !30 + +true2: + %v6 = xor i32 %v5, %a + br label %exit + +false2: + %v7 = and i32 %v5, %a + br label %exit + +exit: + %v8 = phi i32 [%v6, %true2], [%v7, %false2] + ret i32 %v8 +} + + +!llvm.module.flags = !{!1} +!21 = !{!"function_entry_count", i64 10} +!22 = !{!"function_entry_count", i64 400} + +!30 = !{!"branch_weights", i32 1, i32 1} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 10000} +!5 = !{!"MaxCount", i64 10} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 1000} +!8 = !{!"NumCounts", i64 3} +!9 = !{!"NumFunctions", i64 3} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 100, i32 1} +!13 = !{i32 999000, i64 100, i32 1} +!14 = !{i32 999999, i64 1, i32 2}