Index: llvm/include/llvm/CodeGen/TailDuplicator.h =================================================================== --- llvm/include/llvm/CodeGen/TailDuplicator.h +++ llvm/include/llvm/CodeGen/TailDuplicator.h @@ -90,7 +90,8 @@ bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl *DuplicatedPreds = nullptr, - function_ref *RemovalCallback = nullptr); + function_ref *RemovalCallback = nullptr, + SmallVectorImpl *CandidatePtr = nullptr); private: using RegSubRegPair = TargetInstrInfo::RegSubRegPair; @@ -118,7 +119,8 @@ MachineBasicBlock *TailBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl &TDBBs, - SmallVectorImpl &Copies); + SmallVectorImpl &Copies, + SmallVectorImpl *CandidatePtr); void appendCopies(MachineBasicBlock *MBB, SmallVectorImpl> &CopyInfos, SmallVectorImpl &Copies); Index: llvm/lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -374,6 +374,9 @@ /// must be done inline. TailDuplicator TailDup; + /// Partial tail duplication threshold. + BlockFrequency DupThreshold; + /// Allocator and owner of BlockChain structures. /// /// We build BlockChains lazily while processing the loop structure of @@ -399,6 +402,8 @@ SmallPtrSet BlocksWithUnanalyzableExits; #endif + void initDupThreshold(); + /// Decrease the UnscheduledPredecessors count for all blocks in chain, and /// if the count goes to 0, add them to the appropriate work list. void markChainSuccessors( @@ -421,6 +426,11 @@ const MachineBasicBlock *BB, const MachineBasicBlock *Succ, const BlockChain &Chain, const BlockFilterSet *BlockFilter, BranchProbability SuccProb, BranchProbability HotProb); + bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred, + BlockFilterSet *BlockFilter); + void findDuplicateCandidates(SmallVectorImpl &Candidates, + MachineBasicBlock *BB, + BlockFilterSet *BlockFilter); bool repeatedlyTailDuplicateBlock( MachineBasicBlock *BB, MachineBasicBlock *&LPred, const MachineBasicBlock *LoopHeaderBB, @@ -1141,6 +1151,11 @@ if (NumDup == 0) return false; + // If profile information is available, findDuplicateCandidates can do more + // precise benefit analysis. + if (F->getFunction().hasProfileData()) + return true; + // This is mainly for function exit BB. // The integrated tail duplication is really designed for increasing // fallthrough from predecessors from Succ to its successors. We may need @@ -1169,9 +1184,6 @@ // // A small number of extra duplication may not hurt too much. We need a better // heuristic to handle it. - // - // FIXME: we should selectively tail duplicate a BB into part of its - // predecessors. if ((NumDup > Succ->succ_size()) || !Duplicate) return false; @@ -1799,11 +1811,11 @@ // Placement may have changed tail duplication opportunities. // Check for that now. if (allowTailDupPlacement() && BestSucc && ShouldTailDup) { - // If the chosen successor was duplicated into all its predecessors, - // don't bother laying it out, just go round the loop again with BB as - // the chain end. - if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain, - BlockFilter, PrevUnplacedBlockIt)) + repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain, + BlockFilter, PrevUnplacedBlockIt); + // If the chosen successor was duplicated into BB, don't bother laying + // it out, just go round the loop again with BB as the chain end. + if (!BB->isSuccessor(BestSucc)) continue; } @@ -3012,8 +3024,18 @@ SmallVector DuplicatedPreds; bool IsSimple = TailDup.isSimpleBB(BB); - TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, - &DuplicatedPreds, &RemovalCallbackRef); + SmallVector CandidatePreds; + SmallVectorImpl *CandidatePtr = nullptr; + if (F->getFunction().hasProfileData()) { + // We can do partial duplication with precise profile information. + findDuplicateCandidates(CandidatePreds, BB, BlockFilter); + if (CandidatePreds.size() == 0) + return false; + if (CandidatePreds.size() < BB->pred_size()) + CandidatePtr = &CandidatePreds; + } + TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds, + &RemovalCallbackRef, CandidatePtr); // Update UnscheduledPredecessors to reflect tail-duplication. DuplicatedToLPred = false; @@ -3036,6 +3058,142 @@ return Removed; } +// Count the number of actual machine instructions. +static uint64_t countMBBInstruction(MachineBasicBlock *MBB) { + uint64_t InstrCount = 0; + for (MachineInstr &MI : *MBB) { + if (!MI.isPHI() && !MI.isMetaInstruction()) + InstrCount += 1; + } + return InstrCount; +} + +// Returns true if BB is Pred's most possible successor. +bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB, + MachineBasicBlock *Pred, + BlockFilterSet *BlockFilter) { + if (BB == Pred) + return false; + if (BlockFilter && !BlockFilter->count(Pred)) + return false; + BlockChain *PredChain = BlockToChain[Pred]; + if (PredChain && (Pred != *std::prev(PredChain->end()))) + return false; + + // Find the successor with largest probability. + BranchProbability BestProb = BranchProbability::getZero(); + for (MachineBasicBlock *Succ : Pred->successors()) + if (Succ != BB) { + if (BlockFilter && !BlockFilter->count(Succ)) + continue; + BlockChain *SuccChain = BlockToChain[Succ]; + if (SuccChain && (Succ != *SuccChain->begin())) + continue; + BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ); + if (SuccProb > BestProb) + BestProb = SuccProb; + } + + BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB); + if (BBProb <= BestProb) + return false; + + BlockFrequency PredFreq = MBFI->getBlockFreq(Pred); + BlockFrequency Gain = PredFreq * (BBProb - BestProb); + return Gain > DupThreshold.getFrequency() * countMBBInstruction(BB); +} + +void MachineBlockPlacement::findDuplicateCandidates( + SmallVectorImpl &Candidates, + MachineBasicBlock *BB, + BlockFilterSet *BlockFilter) { + MachineBasicBlock *Fallthrough = nullptr; + BranchProbability DefaultBranchProb = BranchProbability::getZero(); + BlockFrequency BBDupThreshold(DupThreshold.getFrequency() * countMBBInstruction(BB)); + SmallVector Preds(BB->pred_begin(), BB->pred_end()); + SmallVector Succs(BB->succ_begin(), BB->succ_end()); + + // Sort for highest frequency. + auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) { + return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B); + }; + auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) { + return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B); + }; + llvm::stable_sort(Succs, CmpSucc); + llvm::stable_sort(Preds, CmpPred); + + auto SuccIt = Succs.begin(); + if (SuccIt != Succs.end()) { + DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl(); + } + + // For each predecessors of BB, compute the benefit of duplicating BB, + // if it is larger than the threshold, add it into Candidates. + for (MachineBasicBlock *Pred : Preds) { + BlockFrequency PredFreq = MBFI->getBlockFreq(Pred); + + if (!TailDup.canTailDuplicate(BB, Pred)) { + // BB can't be duplicated into Pred, but it is possible to be layout + // below Pred. + if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) { + Fallthrough = Pred; + if (SuccIt != Succs.end()) + SuccIt++; + } + continue; + } + + BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb; + BlockFrequency DupCost; + if (SuccIt == Succs.end()) { + // Jump to all successors; + if (Succs.size() > 0) + DupCost += PredFreq; + } else { + // Fallthrough to *SuccIt, jump to all other successors; + DupCost += PredFreq; + DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt); + } + + if (DupCost >= OrigCost) + continue; + OrigCost -= DupCost; + if (OrigCost > BBDupThreshold) { + Candidates.push_back(Pred); + if (SuccIt != Succs.end()) + SuccIt++; + } + } + + // No predecessors can optimally fallthrough to BB. + // So we can change one duplication into fallthrough. + if (!Fallthrough) { + if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) { + Candidates[0] = Candidates.back(); + Candidates.pop_back(); + } + } +} + +void MachineBlockPlacement::initDupThreshold() { + DupThreshold = 0; + if (!F->getFunction().hasProfileData()) + return; + + BlockFrequency MaxFreq = 0; + for (MachineBasicBlock &MBB : *F) { + BlockFrequency Freq = MBFI->getBlockFreq(&MBB); + if (Freq > MaxFreq) + MaxFreq = Freq; + } + + // FIXME: we may use profile count instead of frequency, + // and need more fine tuning. + BranchProbability ThresholdProb(TailDupPlacementPenalty, 100); + DupThreshold = MaxFreq * ThresholdProb; +} + bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -3054,6 +3212,8 @@ MPDT = nullptr; PSI = &getAnalysis().getPSI(); + initDupThreshold(); + // Initialize PreferredLoopExit to nullptr here since it may never be set if // there are no MachineLoops. PreferredLoopExit = nullptr; Index: llvm/lib/CodeGen/TailDuplicator.cpp =================================================================== --- llvm/lib/CodeGen/TailDuplicator.cpp +++ llvm/lib/CodeGen/TailDuplicator.cpp @@ -159,14 +159,16 @@ bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl *DuplicatedPreds, - function_ref *RemovalCallback) { + function_ref *RemovalCallback, + SmallVectorImpl *CandidatePtr) { // Save the successors list. SmallSetVector Succs(MBB->succ_begin(), MBB->succ_end()); SmallVector TDBBs; SmallVector Copies; - if (!tailDuplicate(IsSimple, MBB, ForcedLayoutPred, TDBBs, Copies)) + if (!tailDuplicate(IsSimple, MBB, ForcedLayoutPred, + TDBBs, Copies, CandidatePtr)) return false; ++NumTails; @@ -802,9 +804,10 @@ /// \p Copies A vector of copy instructions inserted. Used later to /// walk all the inserted copies and remove redundant ones. bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB, - MachineBasicBlock *ForcedLayoutPred, - SmallVectorImpl &TDBBs, - SmallVectorImpl &Copies) { + MachineBasicBlock *ForcedLayoutPred, + SmallVectorImpl &TDBBs, + SmallVectorImpl &Copies, + SmallVectorImpl *CandidatePtr) { LLVM_DEBUG(dbgs() << "\n*** Tail-duplicating " << printMBBReference(*TailBB) << '\n'); @@ -818,8 +821,12 @@ // block into them, if possible. Copying the list ahead of time also // avoids trouble with the predecessor list reallocating. bool Changed = false; - SmallSetVector Preds(TailBB->pred_begin(), - TailBB->pred_end()); + SmallSetVector Preds; + if (CandidatePtr) + Preds.insert(CandidatePtr->begin(), CandidatePtr->end()); + else + Preds.insert(TailBB->pred_begin(), TailBB->pred_end()); + for (MachineBasicBlock *PredBB : Preds) { assert(TailBB != PredBB && "Single-block loop should have been rejected earlier!"); @@ -828,13 +835,17 @@ continue; // Don't duplicate into a fall-through predecessor (at least for now). - bool IsLayoutSuccessor = false; - if (ForcedLayoutPred) - IsLayoutSuccessor = (ForcedLayoutPred == PredBB); - else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) - IsLayoutSuccessor = true; - if (IsLayoutSuccessor) - continue; + // If profile is available, findDuplicateCandidates can choose better + // fall-through predecessor. + if (!MF->getFunction().hasProfileData()) { + bool IsLayoutSuccessor = false; + if (ForcedLayoutPred) + IsLayoutSuccessor = (ForcedLayoutPred == PredBB); + else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) + IsLayoutSuccessor = true; + if (IsLayoutSuccessor) + continue; + } LLVM_DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB << "From Succ: " << *TailBB); Index: llvm/test/CodeGen/X86/partial-tail-dup.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/partial-tail-dup.ll @@ -0,0 +1,187 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + + +@gvar = external global i32 + +; dupbb has two predecessors, p1 and p2. p1 is hot, p2 is cold. So dupbb +; should be placed after p1, and not duplicated into p2. +; +; CHECK-LABEL: test1 +; CHECK: %p1 +; CHECK: .LBB0_4: # %dupbb +; CHECK: %p2 +; CHECK: jmp .LBB0_4 + +define void @test1(i32* %p) !prof !1 { +entry: + br label %header + +header: + %call = call zeroext i1 @a() + br i1 %call, label %p1, label %p2, !prof !2 + +p1: + call void @b() + br label %dupbb + +p2: + call void @c() + br label %dupbb + +dupbb: + %cond = icmp eq i32* @gvar, %p + br i1 %cond, label %header, label %latch, !prof !3 + +latch: + %call3 = call zeroext i1 @a() + br i1 %call3, label %header, label %end, !prof !2 + +end: + ret void +} + + +; dupbb has four predecessors p1, p2, p3 and p4. p1 and p2 are hot, p3 and p4 +; are cold. So dupbb should be placed after p1, duplicated into p2. p3 and p4 +; should jump to dupbb. +; +; CHECK-LABEL: test2 +; CHECK: %p1 +; CHECK: .LBB1_8: # %dupbb +; +; CHECK: %p2 +; CHECK: callq c +; CHECK-NEXT: cmpq +; CHECK-NEXT: je +; CHECK-NEXT: jmp +; +; CHECK: %p3 +; CHECK: jmp .LBB1_8 +; CHECK: %p4 +; CHECK: jmp .LBB1_8 + +define void @test2(i32* %p) !prof !1 { +entry: + br label %header + +header: + %call = call zeroext i1 @a() + br i1 %call, label %bb1, label %bb2, !prof !2 + +bb1: + %call1 = call zeroext i1 @a() + br i1 %call1, label %p1, label %p2, !prof !4 + +bb2: + %call2 = call zeroext i1 @a() + br i1 %call2, label %p3, label %p4, !prof !4 + +p1: + call void @b() + br label %dupbb + +p2: + call void @c() + br label %dupbb + +p3: + call void @d() + br label %dupbb + +p4: + call void @e() + br label %dupbb + +dupbb: + %cond = icmp eq i32* @gvar, %p + br i1 %cond, label %bb3, label %bb4, !prof !4 + +bb3: + call void @b() + br label %bb4 + +bb4: + %call4 = call zeroext i1 @a() + br i1 %call4, label %header, label %latch, !prof !3 + +latch: + %call3 = call zeroext i1 @a() + br i1 %call3, label %header, label %end, !prof !2 + +end: + ret void +} + + +; dupbb has three predecessors p1, p2 and p3. p3 has two successors, so dupbb +; can't be duplicated into p3, but it should not block it to be duplicated into +; other predecessors. +; +; CHECK-LABEL: test3 +; CHECK: %p1 +; CHECK: .LBB2_6: # %dupbb +; +; CHECK: %p2 +; CHECK: callq c +; CHECK: cmpq +; CHECK-NEXT: je +; CHECK-NEXT: jmp +; +; CHECK: %p3 +; CHECK: jne .LBB2_6 + +define void @test3(i32* %p) !prof !1 { +entry: + br label %header + +header: + %call = call zeroext i1 @a() + br i1 %call, label %bb1, label %p3, !prof !2 + +bb1: + %call1 = call zeroext i1 @a() + br i1 %call1, label %p1, label %p2, !prof !4 + +p1: + call void @b() + br label %dupbb + +p2: + call void @c() + br label %dupbb + +p3: + %call2 = call zeroext i1 @a() + br i1 %call2, label %dupbb, label %bb4, !prof !4 + +dupbb: + %cond = icmp eq i32* @gvar, %p + br i1 %cond, label %bb3, label %bb4, !prof !4 + +bb3: + call void @b() + br label %bb4 + +bb4: + %call4 = call zeroext i1 @a() + br i1 %call4, label %header, label %latch, !prof !3 + +latch: + %call3 = call zeroext i1 @a() + br i1 %call3, label %header, label %end, !prof !2 + +end: + ret void +} + +declare zeroext i1 @a() +declare void @b() +declare void @c() +declare void @d() +declare void @e() +declare void @f() + +!1 = !{!"function_entry_count", i64 1000} +!2 = !{!"branch_weights", i32 100, i32 1} +!3 = !{!"branch_weights", i32 1, i32 100} +!4 = !{!"branch_weights", i32 60, i32 40}