diff --git a/llvm/include/llvm/CodeGen/TailDuplicator.h b/llvm/include/llvm/CodeGen/TailDuplicator.h --- a/llvm/include/llvm/CodeGen/TailDuplicator.h +++ b/llvm/include/llvm/CodeGen/TailDuplicator.h @@ -87,11 +87,13 @@ /// of predecessors that received a copy of \p MBB. /// If \p RemovalCallback is non-null. It will be called before MBB is /// deleted. + /// If \p CandidatePtr is not null, duplicate into these blocks only. bool tailDuplicateAndUpdate( bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl *DuplicatedPreds = nullptr, - function_ref *RemovalCallback = nullptr); + function_ref *RemovalCallback = nullptr, + SmallVectorImpl *CandidatePtr = nullptr); private: using RegSubRegPair = TargetInstrInfo::RegSubRegPair; @@ -119,7 +121,8 @@ MachineBasicBlock *TailBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl &TDBBs, - SmallVectorImpl &Copies); + SmallVectorImpl &Copies, + SmallVectorImpl *CandidatePtr); void appendCopies(MachineBasicBlock *MBB, SmallVectorImpl> &CopyInfos, SmallVectorImpl &Copies); diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -374,6 +374,9 @@ /// must be done inline. TailDuplicator TailDup; + /// Partial tail duplication threshold. + BlockFrequency DupThreshold; + /// Allocator and owner of BlockChain structures. /// /// We build BlockChains lazily while processing the loop structure of @@ -399,6 +402,10 @@ SmallPtrSet BlocksWithUnanalyzableExits; #endif + /// Scale the DupThreshold according to basic block size. + BlockFrequency scaleThreshold(MachineBasicBlock *BB); + void initDupThreshold(); + /// Decrease the UnscheduledPredecessors count for all blocks in chain, and /// if the count goes to 0, add them to the appropriate work list. void markChainSuccessors( @@ -421,6 +428,11 @@ const MachineBasicBlock *BB, const MachineBasicBlock *Succ, const BlockChain &Chain, const BlockFilterSet *BlockFilter, BranchProbability SuccProb, BranchProbability HotProb); + bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred, + BlockFilterSet *BlockFilter); + void findDuplicateCandidates(SmallVectorImpl &Candidates, + MachineBasicBlock *BB, + BlockFilterSet *BlockFilter); bool repeatedlyTailDuplicateBlock( MachineBasicBlock *BB, MachineBasicBlock *&LPred, const MachineBasicBlock *LoopHeaderBB, @@ -1141,6 +1153,11 @@ if (NumDup == 0) return false; + // If profile information is available, findDuplicateCandidates can do more + // precise benefit analysis. + if (F->getFunction().hasProfileData()) + return true; + // This is mainly for function exit BB. // The integrated tail duplication is really designed for increasing // fallthrough from predecessors from Succ to its successors. We may need @@ -1169,9 +1186,6 @@ // // A small number of extra duplication may not hurt too much. We need a better // heuristic to handle it. - // - // FIXME: we should selectively tail duplicate a BB into part of its - // predecessors. if ((NumDup > Succ->succ_size()) || !Duplicate) return false; @@ -1799,11 +1813,11 @@ // Placement may have changed tail duplication opportunities. // Check for that now. if (allowTailDupPlacement() && BestSucc && ShouldTailDup) { - // If the chosen successor was duplicated into all its predecessors, - // don't bother laying it out, just go round the loop again with BB as - // the chain end. - if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain, - BlockFilter, PrevUnplacedBlockIt)) + repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain, + BlockFilter, PrevUnplacedBlockIt); + // If the chosen successor was duplicated into BB, don't bother laying + // it out, just go round the loop again with BB as the chain end. + if (!BB->isSuccessor(BestSucc)) continue; } @@ -3011,8 +3025,18 @@ SmallVector DuplicatedPreds; bool IsSimple = TailDup.isSimpleBB(BB); - TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, - &DuplicatedPreds, &RemovalCallbackRef); + SmallVector CandidatePreds; + SmallVectorImpl *CandidatePtr = nullptr; + if (F->getFunction().hasProfileData()) { + // We can do partial duplication with precise profile information. + findDuplicateCandidates(CandidatePreds, BB, BlockFilter); + if (CandidatePreds.size() == 0) + return false; + if (CandidatePreds.size() < BB->pred_size()) + CandidatePtr = &CandidatePreds; + } + TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds, + &RemovalCallbackRef, CandidatePtr); // Update UnscheduledPredecessors to reflect tail-duplication. DuplicatedToLPred = false; @@ -3035,6 +3059,191 @@ return Removed; } +// Count the number of actual machine instructions. +static uint64_t countMBBInstruction(MachineBasicBlock *MBB) { + uint64_t InstrCount = 0; + for (MachineInstr &MI : *MBB) { + if (!MI.isPHI() && !MI.isMetaInstruction()) + InstrCount += 1; + } + return InstrCount; +} + +// The size cost of duplication is the instruction size of the duplicated block. +// So we should scale the threshold accordingly. But the instruction size is not +// available on all targets, so we use the number of instructions instead. +BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) { + return DupThreshold.getFrequency() * countMBBInstruction(BB); +} + +// Returns true if BB is Pred's best successor. +bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB, + MachineBasicBlock *Pred, + BlockFilterSet *BlockFilter) { + if (BB == Pred) + return false; + if (BlockFilter && !BlockFilter->count(Pred)) + return false; + BlockChain *PredChain = BlockToChain[Pred]; + if (PredChain && (Pred != *std::prev(PredChain->end()))) + return false; + + // Find the successor with largest probability excluding BB. + BranchProbability BestProb = BranchProbability::getZero(); + for (MachineBasicBlock *Succ : Pred->successors()) + if (Succ != BB) { + if (BlockFilter && !BlockFilter->count(Succ)) + continue; + BlockChain *SuccChain = BlockToChain[Succ]; + if (SuccChain && (Succ != *SuccChain->begin())) + continue; + BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ); + if (SuccProb > BestProb) + BestProb = SuccProb; + } + + BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB); + if (BBProb <= BestProb) + return false; + + // Compute the number of reduced taken branches if Pred falls through to BB + // instead of another successor. Then compare it with threshold. + BlockFrequency PredFreq = MBFI->getBlockFreq(Pred); + BlockFrequency Gain = PredFreq * (BBProb - BestProb); + return Gain > scaleThreshold(BB); +} + +// Find out the predecessors of BB and BB can be beneficially duplicated into +// them. +void MachineBlockPlacement::findDuplicateCandidates( + SmallVectorImpl &Candidates, + MachineBasicBlock *BB, + BlockFilterSet *BlockFilter) { + MachineBasicBlock *Fallthrough = nullptr; + BranchProbability DefaultBranchProb = BranchProbability::getZero(); + BlockFrequency BBDupThreshold(scaleThreshold(BB)); + SmallVector Preds(BB->pred_begin(), BB->pred_end()); + SmallVector Succs(BB->succ_begin(), BB->succ_end()); + + // Sort for highest frequency. + auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) { + return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B); + }; + auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) { + return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B); + }; + llvm::stable_sort(Succs, CmpSucc); + llvm::stable_sort(Preds, CmpPred); + + auto SuccIt = Succs.begin(); + if (SuccIt != Succs.end()) { + DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl(); + } + + // For each predecessors of BB, compute the benefit of duplicating BB, + // if it is larger than the threshold, add it into Candidates. + // + // If we have following control flow. + // + // PB1 PB2 PB3 PB4 + // \ | / /\ + // \ | / / \ + // \ |/ / \ + // BB----/ OB + // /\ + // / \ + // SB1 SB2 + // + // And it can be partially duplicated as + // + // PB2+BB + // | PB1 PB3 PB4 + // | | / /\ + // | | / / \ + // | |/ / \ + // | BB----/ OB + // |\ /| + // | X | + // |/ \| + // SB2 SB1 + // + // The benefit of duplicating into a predecessor is defined as + // Orig_taken_branch - Duplicated_taken_branch + // + // The Orig_taken_branch is computed with the assumption that predecessor + // jumps to BB and the most possible successor is laid out after BB. + // + // The Duplicated_taken_branch is computed with the assumption that BB is + // duplicated into PB, and one successor is layout after it (SB1 for PB1 and + // SB2 for PB2 in our case). If there is no available successor, the combined + // block jumps to all BB's successor, like PB3 in this example. + // + // If a predecessor has multiple successors, so BB can't be duplicated into + // it. But it can beneficially fall through to BB, and duplicate BB into other + // predecessors. + for (MachineBasicBlock *Pred : Preds) { + BlockFrequency PredFreq = MBFI->getBlockFreq(Pred); + + if (!TailDup.canTailDuplicate(BB, Pred)) { + // BB can't be duplicated into Pred, but it is possible to be layout + // below Pred. + if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) { + Fallthrough = Pred; + if (SuccIt != Succs.end()) + SuccIt++; + } + continue; + } + + BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb; + BlockFrequency DupCost; + if (SuccIt == Succs.end()) { + // Jump to all successors; + if (Succs.size() > 0) + DupCost += PredFreq; + } else { + // Fallthrough to *SuccIt, jump to all other successors; + DupCost += PredFreq; + DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt); + } + + assert(OrigCost >= DupCost); + OrigCost -= DupCost; + if (OrigCost > BBDupThreshold) { + Candidates.push_back(Pred); + if (SuccIt != Succs.end()) + SuccIt++; + } + } + + // No predecessors can optimally fallthrough to BB. + // So we can change one duplication into fallthrough. + if (!Fallthrough) { + if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) { + Candidates[0] = Candidates.back(); + Candidates.pop_back(); + } + } +} + +void MachineBlockPlacement::initDupThreshold() { + DupThreshold = 0; + if (!F->getFunction().hasProfileData()) + return; + + BlockFrequency MaxFreq = 0; + for (MachineBasicBlock &MBB : *F) { + BlockFrequency Freq = MBFI->getBlockFreq(&MBB); + if (Freq > MaxFreq) + MaxFreq = Freq; + } + + // FIXME: we may use profile count instead of frequency, + // and need more fine tuning. + BranchProbability ThresholdProb(TailDupPlacementPenalty, 100); + DupThreshold = MaxFreq * ThresholdProb; +} + bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -3053,6 +3262,8 @@ MPDT = nullptr; PSI = &getAnalysis().getPSI(); + initDupThreshold(); + // Initialize PreferredLoopExit to nullptr here since it may never be set if // there are no MachineLoops. PreferredLoopExit = nullptr; diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -159,14 +159,16 @@ bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl *DuplicatedPreds, - function_ref *RemovalCallback) { + function_ref *RemovalCallback, + SmallVectorImpl *CandidatePtr) { // Save the successors list. SmallSetVector Succs(MBB->succ_begin(), MBB->succ_end()); SmallVector TDBBs; SmallVector Copies; - if (!tailDuplicate(IsSimple, MBB, ForcedLayoutPred, TDBBs, Copies)) + if (!tailDuplicate(IsSimple, MBB, ForcedLayoutPred, + TDBBs, Copies, CandidatePtr)) return false; ++NumTails; @@ -804,9 +806,10 @@ /// \p Copies A vector of copy instructions inserted. Used later to /// walk all the inserted copies and remove redundant ones. bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB, - MachineBasicBlock *ForcedLayoutPred, - SmallVectorImpl &TDBBs, - SmallVectorImpl &Copies) { + MachineBasicBlock *ForcedLayoutPred, + SmallVectorImpl &TDBBs, + SmallVectorImpl &Copies, + SmallVectorImpl *CandidatePtr) { LLVM_DEBUG(dbgs() << "\n*** Tail-duplicating " << printMBBReference(*TailBB) << '\n'); @@ -820,8 +823,12 @@ // block into them, if possible. Copying the list ahead of time also // avoids trouble with the predecessor list reallocating. bool Changed = false; - SmallSetVector Preds(TailBB->pred_begin(), - TailBB->pred_end()); + SmallSetVector Preds; + if (CandidatePtr) + Preds.insert(CandidatePtr->begin(), CandidatePtr->end()); + else + Preds.insert(TailBB->pred_begin(), TailBB->pred_end()); + for (MachineBasicBlock *PredBB : Preds) { assert(TailBB != PredBB && "Single-block loop should have been rejected earlier!"); @@ -830,13 +837,17 @@ continue; // Don't duplicate into a fall-through predecessor (at least for now). - bool IsLayoutSuccessor = false; - if (ForcedLayoutPred) - IsLayoutSuccessor = (ForcedLayoutPred == PredBB); - else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) - IsLayoutSuccessor = true; - if (IsLayoutSuccessor) - continue; + // If profile is available, findDuplicateCandidates can choose better + // fall-through predecessor. + if (!(MF->getFunction().hasProfileData() && LayoutMode)) { + bool IsLayoutSuccessor = false; + if (ForcedLayoutPred) + IsLayoutSuccessor = (ForcedLayoutPred == PredBB); + else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) + IsLayoutSuccessor = true; + if (IsLayoutSuccessor) + continue; + } LLVM_DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB << "From Succ: " << *TailBB); diff --git a/llvm/test/CodeGen/X86/partial-tail-dup.ll b/llvm/test/CodeGen/X86/partial-tail-dup.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/partial-tail-dup.ll @@ -0,0 +1,187 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + + +@gvar = external global i32 + +; dupbb has two predecessors, p1 and p2. p1 is hot, p2 is cold. So dupbb +; should be placed after p1, and not duplicated into p2. +; +; CHECK-LABEL: test1 +; CHECK: %p1 +; CHECK: .LBB0_4: # %dupbb +; CHECK: %p2 +; CHECK: jmp .LBB0_4 + +define void @test1(i32* %p) !prof !1 { +entry: + br label %header + +header: + %call = call zeroext i1 @a() + br i1 %call, label %p1, label %p2, !prof !2 + +p1: + call void @b() + br label %dupbb + +p2: + call void @c() + br label %dupbb + +dupbb: + %cond = icmp eq i32* @gvar, %p + br i1 %cond, label %header, label %latch, !prof !3 + +latch: + %call3 = call zeroext i1 @a() + br i1 %call3, label %header, label %end, !prof !2 + +end: + ret void +} + + +; dupbb has four predecessors p1, p2, p3 and p4. p1 and p2 are hot, p3 and p4 +; are cold. So dupbb should be placed after p1, duplicated into p2. p3 and p4 +; should jump to dupbb. +; +; CHECK-LABEL: test2 +; CHECK: %p1 +; CHECK: .LBB1_8: # %dupbb +; +; CHECK: %p2 +; CHECK: callq c +; CHECK-NEXT: cmpq +; CHECK-NEXT: je +; CHECK-NEXT: jmp +; +; CHECK: %p3 +; CHECK: jmp .LBB1_8 +; CHECK: %p4 +; CHECK: jmp .LBB1_8 + +define void @test2(i32* %p) !prof !1 { +entry: + br label %header + +header: + %call = call zeroext i1 @a() + br i1 %call, label %bb1, label %bb2, !prof !2 + +bb1: + %call1 = call zeroext i1 @a() + br i1 %call1, label %p1, label %p2, !prof !4 + +bb2: + %call2 = call zeroext i1 @a() + br i1 %call2, label %p3, label %p4, !prof !4 + +p1: + call void @b() + br label %dupbb + +p2: + call void @c() + br label %dupbb + +p3: + call void @d() + br label %dupbb + +p4: + call void @e() + br label %dupbb + +dupbb: + %cond = icmp eq i32* @gvar, %p + br i1 %cond, label %bb3, label %bb4, !prof !4 + +bb3: + call void @b() + br label %bb4 + +bb4: + %call4 = call zeroext i1 @a() + br i1 %call4, label %header, label %latch, !prof !3 + +latch: + %call3 = call zeroext i1 @a() + br i1 %call3, label %header, label %end, !prof !2 + +end: + ret void +} + + +; dupbb has three predecessors p1, p2 and p3. p3 has two successors, so dupbb +; can't be duplicated into p3, but it should not block it to be duplicated into +; other predecessors. +; +; CHECK-LABEL: test3 +; CHECK: %p1 +; CHECK: .LBB2_6: # %dupbb +; +; CHECK: %p2 +; CHECK: callq c +; CHECK: cmpq +; CHECK-NEXT: je +; CHECK-NEXT: jmp +; +; CHECK: %p3 +; CHECK: jne .LBB2_6 + +define void @test3(i32* %p) !prof !1 { +entry: + br label %header + +header: + %call = call zeroext i1 @a() + br i1 %call, label %bb1, label %p3, !prof !2 + +bb1: + %call1 = call zeroext i1 @a() + br i1 %call1, label %p1, label %p2, !prof !4 + +p1: + call void @b() + br label %dupbb + +p2: + call void @c() + br label %dupbb + +p3: + %call2 = call zeroext i1 @a() + br i1 %call2, label %dupbb, label %bb4, !prof !4 + +dupbb: + %cond = icmp eq i32* @gvar, %p + br i1 %cond, label %bb3, label %bb4, !prof !4 + +bb3: + call void @b() + br label %bb4 + +bb4: + %call4 = call zeroext i1 @a() + br i1 %call4, label %header, label %latch, !prof !3 + +latch: + %call3 = call zeroext i1 @a() + br i1 %call3, label %header, label %end, !prof !2 + +end: + ret void +} + +declare zeroext i1 @a() +declare void @b() +declare void @c() +declare void @d() +declare void @e() +declare void @f() + +!1 = !{!"function_entry_count", i64 1000} +!2 = !{!"branch_weights", i32 100, i32 1} +!3 = !{!"branch_weights", i32 1, i32 100} +!4 = !{!"branch_weights", i32 60, i32 40}