diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -607,6 +607,10 @@ /// operands in the successor blocks which refer to FromMBB to refer to this. void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB); + /// move all pseudo probes in this block to the end of /c ToMBB To and tag + /// them dangling. + void moveAndDanglePseudoProbes(MachineBasicBlock *ToMBB); + /// Return true if any of the successors have probabilities attached to them. bool hasSuccessorProbabilities() const { return !Probs.empty(); } @@ -667,16 +671,18 @@ /// Returns an iterator to the first non-debug instruction in the basic block, /// or end(). - iterator getFirstNonDebugInstr(); - const_iterator getFirstNonDebugInstr() const { - return const_cast(this)->getFirstNonDebugInstr(); + iterator getFirstNonDebugInstr(bool SkipPseudoOp = false); + const_iterator getFirstNonDebugInstr(bool SkipPseudoOp = false) const { + return const_cast(this)->getFirstNonDebugInstr( + SkipPseudoOp); } /// Returns an iterator to the last non-debug instruction in the basic block, /// or end(). - iterator getLastNonDebugInstr(); - const_iterator getLastNonDebugInstr() const { - return const_cast(this)->getLastNonDebugInstr(); + iterator getLastNonDebugInstr(bool SkipPseudoOp = false); + const_iterator getLastNonDebugInstr(bool SkipPseudoOp = false) const { + return const_cast(this)->getLastNonDebugInstr( + SkipPseudoOp); } /// Convenience function that returns true if the block ends in a return @@ -1065,9 +1071,11 @@ /// and return the resulting iterator. This function should only be used /// MachineBasicBlock::{iterator, const_iterator, instr_iterator, /// const_instr_iterator} and the respective reverse iterators. -template -inline IterT skipDebugInstructionsForward(IterT It, IterT End) { - while (It != End && It->isDebugInstr()) +template +inline IterT skipDebugInstructionsForward(IterT It, IterT End, + bool SkipPseudoOp = false) { + while (It != End && + (It->isDebugInstr() || (SkipPseudoOp && It->isPseudoProbe()))) ++It; return It; } @@ -1076,31 +1084,36 @@ /// and return the resulting iterator. This function should only be used /// MachineBasicBlock::{iterator, const_iterator, instr_iterator, /// const_instr_iterator} and the respective reverse iterators. -template -inline IterT skipDebugInstructionsBackward(IterT It, IterT Begin) { - while (It != Begin && It->isDebugInstr()) +template +inline IterT skipDebugInstructionsBackward(IterT It, IterT Begin, + bool SkipPseudoOp = false) { + while (It != Begin && + (It->isDebugInstr() || (SkipPseudoOp && It->isPseudoProbe()))) --It; return It; } /// Increment \p It, then continue incrementing it while it points to a debug /// instruction. A replacement for std::next. -template inline IterT next_nodbg(IterT It, IterT End) { - return skipDebugInstructionsForward(std::next(It), End); +template +inline IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp = false) { + return skipDebugInstructionsForward(std::next(It), End, SkipPseudoOp); } /// Decrement \p It, then continue decrementing it while it points to a debug /// instruction. A replacement for std::prev. -template inline IterT prev_nodbg(IterT It, IterT Begin) { - return skipDebugInstructionsBackward(std::prev(It), Begin); +template +inline IterT prev_nodbg(IterT It, IterT Begin, bool SkipPseudoOp = false) { + return skipDebugInstructionsBackward(std::prev(It), Begin, SkipPseudoOp); } /// Construct a range iterator which begins at \p It and moves forwards until /// \p End is reached, skipping any debug instructions. template -inline auto instructionsWithoutDebug(IterT It, IterT End) { - return make_filter_range(make_range(It, End), [](const MachineInstr &MI) { - return !MI.isDebugInstr(); +inline auto instructionsWithoutDebug(IterT It, IterT End, + bool SkipPseudoOp = false) { + return make_filter_range(make_range(It, End), [=](const MachineInstr &MI) { + return !MI.isDebugInstr() && !(SkipPseudoOp && MI.isPseudoProbe()); }); } diff --git a/llvm/include/llvm/IR/PseudoProbe.h b/llvm/include/llvm/IR/PseudoProbe.h --- a/llvm/include/llvm/IR/PseudoProbe.h +++ b/llvm/include/llvm/IR/PseudoProbe.h @@ -81,12 +81,17 @@ uint32_t Type; uint32_t Attr; float Factor; + + bool isDangling() const { + return Attr & (uint32_t)PseudoProbeAttributes::Dangling; + } }; Optional extractProbe(const Instruction &Inst); void setProbeDistributionFactor(Instruction &Inst, float Factor); +bool moveAndDanglePseudoProbes(BasicBlock *From, Instruction *To); } // end namespace llvm #endif // LLVM_IR_PSEUDOPROBE_H diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -1217,7 +1217,7 @@ // Blocks should be considered empty if they contain only debug info; // else the debug info would affect codegen. static bool IsEmptyBlock(MachineBasicBlock *MBB) { - return MBB->getFirstNonDebugInstr() == MBB->end(); + return MBB->getFirstNonDebugInstr(true) == MBB->end(); } // Blocks with only debug info and branches should be considered the same @@ -1307,6 +1307,16 @@ for (MachineBasicBlock *PredBB : MBB.predecessors()) if (PredBB->succ_size() == 1) copyDebugInfoToPredecessor(TII, MBB, *PredBB); + + // For AutoFDO, if the block is removed, we won't be able to sample it. To + // avoid assigning a zero weight for BB, move all its pseudo probes into once + // of its predecessors or successors and mark them dangling. This should allow + // the counts inference a chance to get a more reasonable weight for the + // block. + if (!MBB.pred_empty()) + MBB.moveAndDanglePseudoProbes(*MBB.pred_begin()); + else if (!MBB.succ_empty()) + MBB.moveAndDanglePseudoProbes(*MBB.succ_begin()); } bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -254,12 +254,14 @@ return I; } -MachineBasicBlock::iterator MachineBasicBlock::getFirstNonDebugInstr() { +MachineBasicBlock::iterator +MachineBasicBlock::getFirstNonDebugInstr(bool SkipPseudoOp) { // Skip over begin-of-block dbg_value instructions. - return skipDebugInstructionsForward(begin(), end()); + return skipDebugInstructionsForward(begin(), end(), SkipPseudoOp); } -MachineBasicBlock::iterator MachineBasicBlock::getLastNonDebugInstr() { +MachineBasicBlock::iterator +MachineBasicBlock::getLastNonDebugInstr(bool SkipPseudoOp) { // Skip over end-of-block dbg_value instructions. instr_iterator B = instr_begin(), I = instr_end(); while (I != B) { @@ -267,6 +269,8 @@ // Return instruction that starts a bundle. if (I->isDebugInstr() || I->isInsideBundle()) continue; + if (SkipPseudoOp && I->isPseudoProbe()) + continue; return I; } // The block is all debug values. @@ -894,6 +898,32 @@ normalizeSuccProbs(); } +/// A block emptied (i.e., with all instructions moved out of it) won't be +/// sampled at run time. In such cases, AutoFDO will be informed of zero samples +/// collected for the block. This is not accurate and could lead to misleading +/// weights assigned for the block. A way to mitigate that is to treat such +/// block as having unknown counts in the AutoFDO profile loader and allow the +/// counts inference tool a chance to calculate a relatively reasonable weight +/// for it. This can be done by moving all pseudo probes in the emptied block +/// i.e, /c this, to before /c ToMBB and tag them dangling. Note that this is +/// not needed for dead blocks which really have a zero weight. It's per +/// transforms to decide whether to call this function or not. +void MachineBasicBlock::moveAndDanglePseudoProbes(MachineBasicBlock *ToMBB) { + SmallVector ToBeMoved; + for (MachineInstr &MI : instrs()) { + if (MI.isPseudoProbe()) { + MI.addPseudoProbeAttribute(PseudoProbeAttributes::Dangling); + ToBeMoved.push_back(&MI); + } + } + + MachineBasicBlock::iterator I = ToMBB->getFirstTerminator(); + for (MachineInstr *MI : ToBeMoved) { + MI->removeFromParent(); + ToMBB->insert(I, MI); + } +} + bool MachineBasicBlock::isPredecessor(const MachineBasicBlock *MBB) const { return is_contained(predecessors(), MBB); } diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -683,7 +683,7 @@ return false; if (TailBB->pred_empty()) return false; - MachineBasicBlock::iterator I = TailBB->getFirstNonDebugInstr(); + MachineBasicBlock::iterator I = TailBB->getFirstNonDebugInstr(true); if (I == TailBB->end()) return true; return I->isUnconditionalBranch(); @@ -779,6 +779,12 @@ assert(PredBB->succ_size() <= 1); } + // For AutoFDO, since BB is going to be removed, we won't be able to sample + // it. To avoid assigning a zero weight for BB, move all its pseudo probes + // into Succ and mark them dangling. This should allow the counts inference + // a chance to get a more reasonable weight for BB. + TailBB->moveAndDanglePseudoProbes(PredBB); + if (PredTBB) TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DL); diff --git a/llvm/lib/IR/PseudoProbe.cpp b/llvm/lib/IR/PseudoProbe.cpp --- a/llvm/lib/IR/PseudoProbe.cpp +++ b/llvm/lib/IR/PseudoProbe.cpp @@ -96,4 +96,38 @@ } } } + +void addPseudoProbeAttribute(PseudoProbeInst &Inst, + PseudoProbeAttributes Attr) { + IRBuilder<> Builder(&Inst); + uint32_t OldAttr = Inst.getAttributes()->getZExtValue(); + uint32_t NewAttr = OldAttr | (uint32_t)Attr; + if (OldAttr != NewAttr) + Inst.replaceUsesOfWith(Inst.getAttributes(), Builder.getInt32(NewAttr)); +} + +/// A block emptied (i.e., with all instructions moved out of it) won't be +/// sampled at run time. In such cases, AutoFDO will be informed of zero samples +/// collected for the block. This is not accurate and could lead to misleading +/// weights assigned for the block. A way to mitigate that is to treat such +/// block as having unknown counts in the AutoFDO profile loader and allow the +/// counts inference tool a chance to calculate a relatively reasonable weight +/// for it. This can be done by moving all pseudo probes in the emptied block +/// i.e, /c From, to before /c To and tag them dangling. Note that this is +/// not needed for dead blocks which really have a zero weight. It's per +/// transforms to decide whether to call this function or not. +bool moveAndDanglePseudoProbes(BasicBlock *From, Instruction *To) { + SmallVector ToBeMoved; + for (auto &I : *From) { + if (auto *II = dyn_cast(&I)) { + addPseudoProbeAttribute(*II, PseudoProbeAttributes::Dangling); + ToBeMoved.push_back(II); + } + } + + for (auto *I : ToBeMoved) + I->moveBefore(To); + + return !ToBeMoved.empty(); +} } // namespace llvm diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -535,6 +535,11 @@ if (!Probe) return std::error_code(); + // Ignore danling probes since they are logically deleted and should do + // not consume any profile samples. + if (Probe->isDangling()) + return std::error_code(); + const FunctionSamples *FS = findFunctionSamples(Inst); if (!FS) return std::error_code(); diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp --- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp @@ -402,8 +402,15 @@ ProbeFactorMap ProbeFactors; for (auto &Block : F) { for (auto &I : Block) { - if (Optional Probe = extractProbe(I)) - ProbeFactors[Probe->Id] += BBProfileCount(&Block); + if (Optional Probe = extractProbe(I)) { + // Do not count dangling probes since they are logically deleted and the + // current block that a dangling probe resides in doesn't reflect the + // execution count of the probe. The original samples of the probe will + // be distributed among the rest probes if there are any, this is + // less-than-deal but at least we don't lose any samples. + if (!Probe->isDangling()) + ProbeFactors[Probe->Id] += BBProfileCount(&Block); + } } } @@ -411,9 +418,13 @@ for (auto &Block : F) { for (auto &I : Block) { if (Optional Probe = extractProbe(I)) { - float Sum = ProbeFactors[Probe->Id]; - if (Sum != 0) - setProbeDistributionFactor(I, BBProfileCount(&Block) / Sum); + // Ignore danling probes since they are logically deleted and should do + // not consume any profile samples in the subsequent profile annotation. + if (!Probe->isDangling()) { + float Sum = ProbeFactors[Probe->Id]; + if (Sum != 0) + setProbeDistributionFactor(I, BBProfileCount(&Block) / Sum); + } } } } diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -462,7 +462,7 @@ BasicBlock *Succ = BI->getSuccessor(0); if ( // The terminator must be the only non-phi instruction in BB. - BB.getFirstNonPHIOrDbg()->isTerminator() && + BB.getFirstNonPHIOrDbg(true)->isTerminator() && // Don't alter Loop headers and latches to ensure another pass can // detect and transform nested loops later. !LoopHeaders.count(&BB) && !LoopHeaders.count(Succ) && diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -65,6 +65,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/PseudoProbe.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -1100,6 +1101,12 @@ for (BasicBlock *Pred : predecessors(BB)) Pred->getTerminator()->setMetadata(LoopMDKind, LoopMD); + // For AutoFDO, since BB is going to be removed, we won't be able to sample + // it. To avoid assigning a zero weight for BB, move all its pseudo probes + // into Succ and mark them dangling. This should allow the counts inference a + // chance to get a more reasonable weight for BB. + moveAndDanglePseudoProbes(BB, &*Succ->getFirstInsertionPt()); + // Everything that jumped to BB now goes to Succ. BB->replaceAllUsesWith(Succ); if (!Succ->hasName()) Succ->takeName(BB); @@ -2775,6 +2782,13 @@ // TODO: Extend llvm.dbg.value to take more than one SSA Value (PR39141) to // encode predicated DIExpressions that yield different results on different // code paths. + + // A hoisted conditional probe should be treated as dangling so that it will + // not be over-counted when the samples collected on the non-conditional path + // are counted towards the conditional path. We leave it for the counts + // inference algorithm to figure out a proper count for a danglng probe. + moveAndDanglePseudoProbes(BB, InsertPt); + for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) { Instruction *I = &*II; I->dropUnknownNonDebugMetadata(); diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -57,6 +57,7 @@ #include "llvm/IR/NoFolder.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/PseudoProbe.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -2252,7 +2253,6 @@ // probability for ThenBB, which is fine since the optimization here takes // place regardless of the branch probability. if (isa(I)) { - SpeculatedDbgIntrinsics.push_back(I); continue; } @@ -2338,6 +2338,12 @@ I.dropUnknownNonDebugMetadata(); } + // A hoisted conditional probe should be treated as dangling so that it will + // not be over-counted when the samples collected on the non-conditional path + // are counted towards the conditional path. We leave it for the counts + // inference algorithm to figure out a proper count for a danglng probe. + moveAndDanglePseudoProbes(ThenBB, BI); + // Hoist the instructions. BB->getInstList().splice(BI->getIterator(), ThenBB->getInstList(), ThenBB->begin(), std::prev(ThenBB->end())); @@ -6225,7 +6231,7 @@ Options.NeedCanonicalLoop && (!LoopHeaders.empty() && BB->hasNPredecessorsOrMore(2) && (is_contained(LoopHeaders, BB) || is_contained(LoopHeaders, Succ))); - BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator(); + BasicBlock::iterator I = BB->getFirstNonPHIOrDbg(true)->getIterator(); if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && !NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB, DTU)) return true; @@ -6286,8 +6292,8 @@ return requestResimplify(); // This block must be empty, except for the setcond inst, if it exists. - // Ignore dbg intrinsics. - auto I = BB->instructionsWithoutDebug().begin(); + // Ignore dbg and pseudo intrinsics. + auto I = BB->instructionsWithoutDebug(true).begin(); if (&*I == BI) { if (FoldValueComparisonIntoPredecessors(BI, Builder)) return requestResimplify(); diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll @@ -0,0 +1,100 @@ +; REQUIRES: x86_64-linux +; RUN: opt < %s -passes='pseudo-probe,jump-threading' -S -o %t +; RUN: FileCheck %s < %t --check-prefix=JT +; RUN: llc -pseudo-probe-for-profiling -function-sections <%t -filetype=asm | FileCheck %s --check-prefix=ASM +; RUN: opt < %s -passes='pseudo-probe' -S -o %t1 +; RUN: llc -pseudo-probe-for-profiling -stop-after=tailduplication <%t1 | FileCheck %s --check-prefix=MIR-tail +; RUN: opt < %s -passes='pseudo-probe,simplifycfg' -S | FileCheck %s --check-prefix=SC + +declare i32 @f1() + +define i32 @foo(i1 %cond) { +; JT-LABEL: @foo( +; JT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 1, i32 0, i64 -1) +; ASM: pseudoprobe 6699318081062747564 1 0 0 + %call = call i32 @f1() + br i1 %cond, label %T, label %F +T: + br label %Merge +F: + br label %Merge +Merge: +;; Check branch T and F are gone, and their probes (probe 2 and 3) are dangling. +; JT-LABEL-NO: T +; JT-LABEL-NO: F +; JT-LABEL: Merge +; JT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3, i32 2, i64 -1) +; JT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2, i32 2, i64 -1) +; JT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1) +; ASM: .pseudoprobe 6699318081062747564 3 0 2 +; ASM: .pseudoprobe 6699318081062747564 2 0 2 +; ASM: .pseudoprobe 6699318081062747564 4 0 0 + ret i32 %call +} + +;; Check block T and F are gone, and their probes (probe 2 and 3) are dangling. +; MIR-tail: bb.0 +; MIR-tail: PSEUDO_PROBE [[#GUID:]], 1, 0, 0 +; MIR-tail: PSEUDO_PROBE [[#GUID:]], 2, 0, 2 +; MIR-tail: PSEUDO_PROBE [[#GUID:]], 3, 0, 2 +; MIR-tail: PSEUDO_PROBE [[#GUID:]], 4, 0, 0 + + +define void @foo2() { +bb: + %tmp = call i32 @f1() + %tmp1 = icmp eq i32 %tmp, 1 + br i1 %tmp1, label %bb5, label %bb8 + +bb2: + %tmp4 = icmp ne i32 %tmp, 1 + switch i1 %tmp4, label %bb2 [ + i1 0, label %bb5 + i1 1, label %bb8 + ] + +bb5: +;; Check the pseudo probe with id 3 only has one copy. +; JT-COUNT-1: call void @llvm.pseudoprobe(i64 [[#GUID2:]], i64 3, i32 2, i64 -1) + %tmp6 = phi i1 [ %tmp1, %bb ], [ false, %bb2 ] + br i1 %tmp6, label %bb8, label %bb7 + +bb7: + br label %bb8 + +bb8: + ret void +} + +define i32 @test(i32 %a, i32 %b, i32 %c) { +;; Check block bb1 and bb2 are gone, and their probes (probe 2 and 3) are dangling. +; SC-LABEL: @test( +; SC-LABEL-NO: bb1 +; SC-LABEL-NO: bb2 +; SC: [[T1:%.*]] = icmp eq i32 [[B:%.*]], 0 +; SC-DAG: call void @llvm.pseudoprobe(i64 [[#GUID3:]], i64 2, i32 2, i64 -1) +; SC-DAG: call void @llvm.pseudoprobe(i64 [[#GUID3]], i64 3, i32 2, i64 -1) +; SC: [[T2:%.*]] = icmp sgt i32 [[C:%.*]], 1 +; SC: [[T3:%.*]] = add i32 [[A:%.*]], 1 +; SC: [[SPEC_SELECT:%.*]] = select i1 [[T2]], i32 [[T3]], i32 [[A]] +; SC: [[T4:%.*]] = select i1 [[T1]], i32 [[SPEC_SELECT]], i32 [[B]] +; SC: [[T5:%.*]] = sub i32 [[T4]], 1 +; SC: ret i32 [[T5]] + +entry: + %t1 = icmp eq i32 %b, 0 + br i1 %t1, label %bb1, label %bb3 + +bb1: + %t2 = icmp sgt i32 %c, 1 + br i1 %t2, label %bb2, label %bb3 + +bb2: + %t3 = add i32 %a, 1 + br label %bb3 + +bb3: + %t4 = phi i32 [ %b, %entry ], [ %a, %bb1 ], [ %t3, %bb2 ] + %t5 = sub i32 %t4, 1 + ret i32 %t5 +}