diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -595,6 +595,10 @@ /// operands in the successor blocks which refer to FromMBB to refer to this. void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB); + /// move all pseudo probes in this block to the end of /c ToMBB To and tag + /// them dangling. + void moveAndDanglePseudoProbes(MachineBasicBlock *ToMBB); + /// Return true if any of the successors have probabilities attached to them. bool hasSuccessorProbabilities() const { return !Probs.empty(); } @@ -655,16 +659,18 @@ /// Returns an iterator to the first non-debug instruction in the basic block, /// or end(). - iterator getFirstNonDebugInstr(); - const_iterator getFirstNonDebugInstr() const { - return const_cast(this)->getFirstNonDebugInstr(); + iterator getFirstNonDebugInstr(bool SkipPseudoOp = false); + const_iterator getFirstNonDebugInstr(bool SkipPseudoOp = false) const { + return const_cast(this)->getFirstNonDebugInstr( + SkipPseudoOp); } /// Returns an iterator to the last non-debug instruction in the basic block, /// or end(). - iterator getLastNonDebugInstr(); - const_iterator getLastNonDebugInstr() const { - return const_cast(this)->getLastNonDebugInstr(); + iterator getLastNonDebugInstr(bool SkipPseudoOp = false); + const_iterator getLastNonDebugInstr(bool SkipPseudoOp = false) const { + return const_cast(this)->getLastNonDebugInstr( + SkipPseudoOp); } /// Convenience function that returns true if the block ends in a return @@ -1050,9 +1056,11 @@ /// and return the resulting iterator. This function should only be used /// MachineBasicBlock::{iterator, const_iterator, instr_iterator, /// const_instr_iterator} and the respective reverse iterators. -template -inline IterT skipDebugInstructionsForward(IterT It, IterT End) { - while (It != End && It->isDebugInstr()) +template +inline IterT skipDebugInstructionsForward(IterT It, IterT End, + bool SkipPseudoOp = false) { + while (It != End && + (It->isDebugInstr() || (SkipPseudoOp && It->isPseudoProbe()))) ++It; return It; } @@ -1061,31 +1069,36 @@ /// and return the resulting iterator. This function should only be used /// MachineBasicBlock::{iterator, const_iterator, instr_iterator, /// const_instr_iterator} and the respective reverse iterators. -template -inline IterT skipDebugInstructionsBackward(IterT It, IterT Begin) { - while (It != Begin && It->isDebugInstr()) +template +inline IterT skipDebugInstructionsBackward(IterT It, IterT Begin, + bool SkipPseudoOp = false) { + while (It != Begin && + (It->isDebugInstr() || (SkipPseudoOp && It->isPseudoProbe()))) --It; return It; } /// Increment \p It, then continue incrementing it while it points to a debug /// instruction. A replacement for std::next. -template inline IterT next_nodbg(IterT It, IterT End) { - return skipDebugInstructionsForward(std::next(It), End); +template +inline IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp = false) { + return skipDebugInstructionsForward(std::next(It), End, SkipPseudoOp); } /// Decrement \p It, then continue decrementing it while it points to a debug /// instruction. A replacement for std::prev. -template inline IterT prev_nodbg(IterT It, IterT Begin) { - return skipDebugInstructionsBackward(std::prev(It), Begin); +template +inline IterT prev_nodbg(IterT It, IterT Begin, bool SkipPseudoOp = false) { + return skipDebugInstructionsBackward(std::prev(It), Begin, SkipPseudoOp); } /// Construct a range iterator which begins at \p It and moves forwards until /// \p End is reached, skipping any debug instructions. template -inline auto instructionsWithoutDebug(IterT It, IterT End) { - return make_filter_range(make_range(It, End), [](const MachineInstr &MI) { - return !MI.isDebugInstr(); +inline auto instructionsWithoutDebug(IterT It, IterT End, + bool SkipPseudoOp = false) { + return make_filter_range(make_range(It, End), [=](const MachineInstr &MI) { + return !MI.isDebugInstr() && !(SkipPseudoOp && MI.isPseudoProbe()); }); } diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -25,6 +25,7 @@ #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/PseudoProbe.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ArrayRecycler.h" @@ -1156,6 +1157,10 @@ return getOpcode() == TargetOpcode::CFI_INSTRUCTION; } + bool isPseudoProbe() const { + return getOpcode() == TargetOpcode::PSEUDO_PROBE; + } + // True if the instruction represents a position in the function. bool isPosition() const { return isLabel() || isCFIInstruction(); } @@ -1793,6 +1798,19 @@ } } + PseudoProbeAttributes getPseudoProbeAttribute() const { + assert(isPseudoProbe()); + return (PseudoProbeAttributes)getOperand(3).getImm(); + } + + void addPseudoProbeAttribute(PseudoProbeAttributes Attr) { + assert(isPseudoProbe()); + MachineOperand &AttrOperand = getOperand(3); + uint32_t OrigAttr = AttrOperand.getImm(); + OrigAttr |= (uint32_t)Attr; + AttrOperand.setImm(OrigAttr); + } + private: /// If this instruction is embedded into a MachineFunction, return the /// MachineRegisterInfo object for the current function, otherwise diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h --- a/llvm/include/llvm/IR/Instruction.h +++ b/llvm/include/llvm/IR/Instruction.h @@ -650,6 +650,9 @@ /// llvm.lifetime.end marker. bool isLifetimeStartOrEnd() const; + /// Return true if the instruction is a DbgInfoIntrinsic or PseudoProbeInst. + bool isDebugOrPseudoInst() const; + /// Return a pointer to the next non-debug instruction in the same basic /// block as 'this', or nullptr if no such instruction exists. Skip any pseudo /// operations if \c SkipPseudoOp is true. diff --git a/llvm/include/llvm/IR/PseudoProbe.h b/llvm/include/llvm/IR/PseudoProbe.h --- a/llvm/include/llvm/IR/PseudoProbe.h +++ b/llvm/include/llvm/IR/PseudoProbe.h @@ -27,6 +27,11 @@ enum class PseudoProbeType { Block = 0, IndirectCall, DirectCall }; +enum class PseudoProbeAttributes { + Reserved = 0x1, // Reserved for future use. + Dangling = 0x2, // The probe is dangling. +}; + // The saturated distrution factor representing 100% for block probes. constexpr static uint64_t PseudoProbeFullDistributionFactor = std::numeric_limits::max(); @@ -76,12 +81,19 @@ uint32_t Type; uint32_t Attr; float Factor; + + bool isDangling() const { + return Attr & (uint32_t)PseudoProbeAttributes::Dangling; + } }; Optional extractProbe(const Instruction &Inst); void setProbeDistributionFactor(Instruction &Inst, float Factor); +bool moveAndDanglePseudoProbes(BasicBlock *From, Instruction *To); + +bool RemoveRedundantPseudoProbes(BasicBlock *Block); } // end namespace llvm #endif // LLVM_IR_PSEUDOPROBE_H diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h --- a/llvm/include/llvm/MC/MCPseudoProbe.h +++ b/llvm/include/llvm/MC/MCPseudoProbe.h @@ -27,7 +27,7 @@ // TYPE (uint4) // 0 - block probe, 1 - indirect call, 2 - direct call // ATTRIBUTE (uint3) -// reserved +// 1 - reserved, 2 - dangling // ADDRESS_TYPE (uint1) // 0 - code address, 1 - address delta // CODE_ADDRESS (uint64 or ULEB128) diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -579,6 +579,11 @@ return 0; return std::error_code(); } else { + // Return error for an invalid sample count which is usually assigned to + // dangling probe. + if (FunctionSamples::ProfileIsProbeBased && + ret->second.getSamples() == InvalidProbeCount) + return std::error_code(); return ret->second.getSamples(); } } @@ -845,6 +850,8 @@ const DILocation *DIL, SampleProfileReaderItaniumRemapper *Remapper = nullptr) const; + static constexpr uint64_t InvalidProbeCount = UINT64_MAX; + static bool ProfileIsProbeBased; static bool ProfileIsCS; diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h --- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -109,8 +109,9 @@ DomTreeUpdater *DTU = nullptr, LoopInfo *LI = nullptr); /// Try to remove redundant dbg.value instructions from given basic block. -/// Returns true if at least one instruction was removed. -bool RemoveRedundantDbgInstrs(BasicBlock *BB); +/// Returns true if at least one instruction was removed. Remove redundant +/// pseudo ops when RemovePseudoOp is true. +bool RemoveRedundantDbgInstrs(BasicBlock *BB, bool RemovePseudoOp = false); /// Replace all uses of an instruction (specified by BI) with a value, then /// remove and delete the original instruction. diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -1217,7 +1217,7 @@ // Blocks should be considered empty if they contain only debug info; // else the debug info would affect codegen. static bool IsEmptyBlock(MachineBasicBlock *MBB) { - return MBB->getFirstNonDebugInstr() == MBB->end(); + return MBB->getFirstNonDebugInstr(true) == MBB->end(); } // Blocks with only debug info and branches should be considered the same @@ -1307,6 +1307,16 @@ for (MachineBasicBlock *PredBB : MBB.predecessors()) if (PredBB->succ_size() == 1) copyDebugInfoToPredecessor(TII, MBB, *PredBB); + + // For AutoFDO, if the block is removed, we won't be able to sample it. To + // avoid assigning a zero weight for BB, move all its pseudo probes into once + // of its predecessors or successors and mark them dangling. This should allow + // the counts inference a chance to get a more reasonable weight for the + // block. + if (!MBB.pred_empty()) + MBB.moveAndDanglePseudoProbes(*MBB.pred_begin()); + else if (!MBB.succ_empty()) + MBB.moveAndDanglePseudoProbes(*MBB.succ_begin()); } bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -243,12 +243,14 @@ return I; } -MachineBasicBlock::iterator MachineBasicBlock::getFirstNonDebugInstr() { +MachineBasicBlock::iterator +MachineBasicBlock::getFirstNonDebugInstr(bool SkipPseudoOp) { // Skip over begin-of-block dbg_value instructions. - return skipDebugInstructionsForward(begin(), end()); + return skipDebugInstructionsForward(begin(), end(), SkipPseudoOp); } -MachineBasicBlock::iterator MachineBasicBlock::getLastNonDebugInstr() { +MachineBasicBlock::iterator +MachineBasicBlock::getLastNonDebugInstr(bool SkipPseudoOp) { // Skip over end-of-block dbg_value instructions. instr_iterator B = instr_begin(), I = instr_end(); while (I != B) { @@ -256,6 +258,8 @@ // Return instruction that starts a bundle. if (I->isDebugInstr() || I->isInsideBundle()) continue; + if (SkipPseudoOp && I->isPseudoProbe()) + continue; return I; } // The block is all debug values. @@ -883,6 +887,32 @@ normalizeSuccProbs(); } +/// A block emptied (i.e., with all instructions moved out of it) won't be +/// sampled at run time. In such cases, AutoFDO will be informed of zero samples +/// collected for the block. This is not accurate and could lead to misleading +/// weights assigned for the block. A way to mitigate that is to treat such +/// block as having unknown counts in the AutoFDO profile loader and allow the +/// counts inference tool a chance to calculate a relatively reasonable weight +/// for it. This can be done by moving all pseudo probes in the emptied block +/// i.e, /c this, to before /c ToMBB and tag them dangling. Note that this is +/// not needed for dead blocks which really have a zero weight. It's per +/// transforms to decide whether to call this function or not. +void MachineBasicBlock::moveAndDanglePseudoProbes(MachineBasicBlock *ToMBB) { + SmallVector ToBeMoved; + for (MachineInstr &MI : instrs()) { + if (MI.isPseudoProbe()) { + MI.addPseudoProbeAttribute(PseudoProbeAttributes::Dangling); + ToBeMoved.push_back(&MI); + } + } + + MachineBasicBlock::iterator I = ToMBB->getFirstTerminator(); + for (MachineInstr *MI : ToBeMoved) { + MI->removeFromParent(); + ToMBB->insert(I, MI); + } +} + bool MachineBasicBlock::isPredecessor(const MachineBasicBlock *MBB) const { return is_contained(predecessors(), MBB); } diff --git a/llvm/lib/CodeGen/PseudoProbeInserter.cpp b/llvm/lib/CodeGen/PseudoProbeInserter.cpp --- a/llvm/lib/CodeGen/PseudoProbeInserter.cpp +++ b/llvm/lib/CodeGen/PseudoProbeInserter.cpp @@ -20,8 +20,9 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/PseudoProbe.h" #include "llvm/InitializePasses.h" +#include "llvm/MC/MCPseudoProbe.h" #include "llvm/Target/TargetMachine.h" -#include +#include #define DEBUG_TYPE "pseudo-probe-inserter" @@ -47,7 +48,10 @@ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); bool Changed = false; for (MachineBasicBlock &MBB : MF) { + MachineInstr *FirstInstr = nullptr; for (MachineInstr &MI : MBB) { + if (!MI.isPseudo()) + FirstInstr = &MI; if (MI.isCall()) { if (DILocation *DL = MI.getDebugLoc()) { auto Value = DL->getDiscriminator(); @@ -65,8 +69,84 @@ } } } + + // Walk the block backwards, move PSEUDO_PROBE before the first real + // instruction to fix out-of-order probes. There is a problem with probes + // as the terminator of the block. During the offline counts processing, + // the samples collected on the first physical instruction following a + // probe will be counted towards the probe. This logically equals to + // treating the instruction next to a probe as if it is from the same + // block of the probe. This is accurate most of the time unless the + // instruction can be reached from multiple flows, which means it actually + // starts a new block. Samples collected on such probes may cause + // imprecision with the counts inference algorithm. Fortunately, if + // there are still other native instructions preceding the probe we can + // use them as a place holder to collect samples for the probe. + if (FirstInstr) { + auto MII = MBB.rbegin(); + while (MII != MBB.rend()) { + // Skip all pseudo probes followed by a real instruction since they + // are not dangling. + if (!MII->isPseudo()) + break; + auto Cur = MII++; + if (Cur->getOpcode() != TargetOpcode::PSEUDO_PROBE) + continue; + // Move the dangling probe before FirstInstr. + auto ProbeInstr = &*Cur; + MBB.remove(ProbeInstr); + MBB.insert(FirstInstr, ProbeInstr); + Changed = true; + } + } else { + // Probes not surrounded by any real instructions in the same block are + // called dangling probes. Since there's no good way to pick up a sample + // collection point for dangling probes at compile time, they are being + // tagged so that the profile correlation tool will not report any + // samples collected for them and it's up to the counts inference tool + // to get them a reasonable count. + for (MachineInstr &MI : MBB) { + if (MI.isPseudoProbe()) + MI.addPseudoProbeAttribute(PseudoProbeAttributes::Dangling); + } + } + } + + // Remove redundant dangling probes. Same dangling probes are redundant + // since they all have the same semantic that is to rely on the counts + // inference too to get reasonable count for the same original block. + // Therefore, there's no need to keep multiple copies of them. + auto Hash = [](const MachineInstr *MI) { + return std::hash()(MI->getOperand(0).getImm()) ^ + std::hash()(MI->getOperand(1).getImm()); + }; + + auto IsEqual = [](const MachineInstr *Left, const MachineInstr *Right) { + return Left->getOperand(0).getImm() == Right->getOperand(0).getImm() && + Left->getOperand(1).getImm() == Right->getOperand(1).getImm() && + Left->getOperand(3).getImm() == Right->getOperand(3).getImm() && + Left->getDebugLoc() == Right->getDebugLoc(); + }; + + SmallVector ToBeRemoved; + std::unordered_set + DanglingProbes(0, Hash, IsEqual); + + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.isPseudoProbe()) { + if ((uint32_t)MI.getPseudoProbeAttribute() & + (uint32_t)PseudoProbeAttributes::Dangling) + if (!DanglingProbes.insert(&MI).second) + ToBeRemoved.push_back(&MI); + } + } } + for (auto *MI : ToBeRemoved) + MI->eraseFromParent(); + + Changed |= !ToBeRemoved.empty(); return Changed; } diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -683,7 +683,7 @@ return false; if (TailBB->pred_empty()) return false; - MachineBasicBlock::iterator I = TailBB->getFirstNonDebugInstr(); + MachineBasicBlock::iterator I = TailBB->getFirstNonDebugInstr(true); if (I == TailBB->end()) return true; return I->isUnconditionalBranch(); @@ -779,6 +779,12 @@ assert(PredBB->succ_size() <= 1); } + // For AutoFDO, since BB is going to be removed, we won't be able to sample + // it. To avoid assigning a zero weight for BB, move all its pseudo probes + // into Succ and mark them dangling. This should allow the counts inference + // a chance to get a more reasonable weight for BB. + TailBB->moveAndDanglePseudoProbes(PredBB); + if (PredTBB) TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DL); diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -641,6 +641,10 @@ return ID == Intrinsic::lifetime_start || ID == Intrinsic::lifetime_end; } +bool Instruction::isDebugOrPseudoInst() const { + return isa(this) || isa(this); +} + const Instruction * Instruction::getNextNonDebugInstruction(bool SkipPseudoOp) const { for (const Instruction *I = getNextNode(); I; I = I->getNextNode()) diff --git a/llvm/lib/IR/PseudoProbe.cpp b/llvm/lib/IR/PseudoProbe.cpp --- a/llvm/lib/IR/PseudoProbe.cpp +++ b/llvm/lib/IR/PseudoProbe.cpp @@ -15,6 +15,7 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" +#include using namespace llvm; @@ -96,4 +97,74 @@ } } } + +void addPseudoProbeAttribute(PseudoProbeInst &Inst, + PseudoProbeAttributes Attr) { + IRBuilder<> Builder(&Inst); + uint32_t OldAttr = Inst.getAttributes()->getZExtValue(); + uint32_t NewAttr = OldAttr | (uint32_t)Attr; + if (OldAttr != NewAttr) + Inst.replaceUsesOfWith(Inst.getAttributes(), Builder.getInt32(NewAttr)); +} + +/// A block emptied (i.e., with all instructions moved out of it) won't be +/// sampled at run time. In such cases, AutoFDO will be informed of zero samples +/// collected for the block. This is not accurate and could lead to misleading +/// weights assigned for the block. A way to mitigate that is to treat such +/// block as having unknown counts in the AutoFDO profile loader and allow the +/// counts inference tool a chance to calculate a relatively reasonable weight +/// for it. This can be done by moving all pseudo probes in the emptied block +/// i.e, /c From, to before /c To and tag them dangling. Note that this is +/// not needed for dead blocks which really have a zero weight. It's per +/// transforms to decide whether to call this function or not. +bool moveAndDanglePseudoProbes(BasicBlock *From, Instruction *To) { + SmallVector ToBeMoved; + for (auto &I : *From) { + if (auto *II = dyn_cast(&I)) { + addPseudoProbeAttribute(*II, PseudoProbeAttributes::Dangling); + ToBeMoved.push_back(II); + } + } + + for (auto *I : ToBeMoved) + I->moveBefore(To); + + return !ToBeMoved.empty(); +} + +/// Same dangling probes in one blocks are redundant since they all have the +/// same semantic that is to rely on the counts inference too to get reasonable +/// count for the same original block. Therefore, there's no need to keep +/// multiple copies of them. +bool RemoveRedundantPseudoProbes(BasicBlock *Block) { + + auto Hash = [](const PseudoProbeInst *I) { + return std::hash()(I->getFuncGuid()->getZExtValue()) ^ + std::hash()(I->getIndex()->getZExtValue()); + }; + + auto IsEqual = [](const PseudoProbeInst *Left, const PseudoProbeInst *Right) { + return Left->getFuncGuid() == Right->getFuncGuid() && + Left->getIndex() == Right->getIndex() && + Left->getAttributes() == Right->getAttributes() && + Left->getDebugLoc() == Right->getDebugLoc(); + }; + + SmallVector ToBeRemoved; + std::unordered_set + DanglingProbes(0, Hash, IsEqual); + + for (auto &I : *Block) { + if (auto *II = dyn_cast(&I)) { + if (II->getAttributes()->getZExtValue() & + (uint32_t)PseudoProbeAttributes::Dangling) + if (!DanglingProbes.insert(II).second) + ToBeRemoved.push_back(II); + } + } + + for (auto *I : ToBeRemoved) + I->eraseFromParent(); + return !ToBeRemoved.empty(); +} } // namespace llvm diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -881,6 +881,11 @@ if (!Probe) return std::error_code(); + // Ignore danling probes since they are logically deleted and should do + // not consume any profile samples. + if (Probe->isDangling()) + return std::error_code(); + const FunctionSamples *FS = findFunctionSamples(Inst); if (!FS) return std::error_code(); diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp --- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include #include #include @@ -402,8 +403,15 @@ ProbeFactorMap ProbeFactors; for (auto &Block : F) { for (auto &I : Block) { - if (Optional Probe = extractProbe(I)) - ProbeFactors[Probe->Id] += BBProfileCount(&Block); + if (Optional Probe = extractProbe(I)) { + // Do not count dangling probes since they are logically deleted and the + // current block that a dangling probe resides in doesn't reflect the + // execution count of the probe. The original samples of the probe will + // be distributed among the rest probes if there are any, this is + // less-than-deal but at least we don't lose any samples. + if (!Probe->isDangling()) + ProbeFactors[Probe->Id] += BBProfileCount(&Block); + } } } @@ -411,9 +419,13 @@ for (auto &Block : F) { for (auto &I : Block) { if (Optional Probe = extractProbe(I)) { - float Sum = ProbeFactors[Probe->Id]; - if (Sum != 0) - setProbeDistributionFactor(I, BBProfileCount(&Block) / Sum); + // Ignore danling probes since they are logically deleted and should do + // not consume any profile samples in the subsequent profile annotation. + if (!Probe->isDangling()) { + float Sum = ProbeFactors[Probe->Id]; + if (Sum != 0) + setProbeDistributionFactor(I, BBProfileCount(&Block) / Sum); + } } } } diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -433,8 +433,9 @@ // Jump threading may have introduced redundant debug values into BB // which should be removed. + // Remove redundant pseudo probes as well. if (Changed) - RemoveRedundantDbgInstrs(&BB); + RemoveRedundantDbgInstrs(&BB, true); // Stop processing BB if it's the entry or is now deleted. The following // routines attempt to eliminate BB and locating a suitable replacement @@ -462,12 +463,12 @@ BasicBlock *Succ = BI->getSuccessor(0); if ( // The terminator must be the only non-phi instruction in BB. - BB.getFirstNonPHIOrDbg()->isTerminator() && + BB.getFirstNonPHIOrDbg(true)->isTerminator() && // Don't alter Loop headers and latches to ensure another pass can // detect and transform nested loops later. !LoopHeaders.count(&BB) && !LoopHeaders.count(Succ) && TryToSimplifyUncondBranchFromEmptyBlock(&BB, DTU)) { - RemoveRedundantDbgInstrs(Succ); + RemoveRedundantDbgInstrs(Succ, true); // BB is valid for cleanup here because we passed in DTU. F remains // BB's parent until a DTU->getDomTree() event. LVI->eraseBlock(&BB); diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PseudoProbe.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -433,7 +434,7 @@ return !ToBeRemoved.empty(); } -bool llvm::RemoveRedundantDbgInstrs(BasicBlock *BB) { +bool llvm::RemoveRedundantDbgInstrs(BasicBlock *BB, bool RemovePseudoOp) { bool MadeChanges = false; // By using the "backward scan" strategy before the "forward scan" strategy we // can remove both dbg.value (2) and (3) in a situation like this: @@ -448,6 +449,8 @@ // already is described as having the value V1 at (1). MadeChanges |= removeRedundantDbgInstrsUsingBackwardScan(BB); MadeChanges |= removeRedundantDbgInstrsUsingForwardScan(BB); + if (RemovePseudoOp) + MadeChanges |= RemoveRedundantPseudoProbes(BB); if (MadeChanges) LLVM_DEBUG(dbgs() << "Removed redundant dbg instrs from: " diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -65,6 +65,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/PseudoProbe.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -1107,6 +1108,12 @@ Pred->getTerminator()->setMetadata(LoopMDKind, LoopMD); } + // For AutoFDO, since BB is going to be removed, we won't be able to sample + // it. To avoid assigning a zero weight for BB, move all its pseudo probes + // into Succ and mark them dangling. This should allow the counts inference a + // chance to get a more reasonable weight for BB. + moveAndDanglePseudoProbes(BB, &*Succ->getFirstInsertionPt()); + // Everything that jumped to BB now goes to Succ. BB->replaceAllUsesWith(Succ); if (!Succ->hasName()) Succ->takeName(BB); @@ -2782,6 +2789,13 @@ // TODO: Extend llvm.dbg.value to take more than one SSA Value (PR39141) to // encode predicated DIExpressions that yield different results on different // code paths. + + // A hoisted conditional probe should be treated as dangling so that it will + // not be over-counted when the samples collected on the non-conditional path + // are counted towards the conditional path. We leave it for the counts + // inference algorithm to figure out a proper count for a danglng probe. + moveAndDanglePseudoProbes(BB, InsertPt); + for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) { Instruction *I = &*II; I->dropUnknownNonDebugMetadata(); diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -57,6 +57,7 @@ #include "llvm/IR/NoFolder.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/PseudoProbe.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -2252,7 +2253,6 @@ // probability for ThenBB, which is fine since the optimization here takes // place regardless of the branch probability. if (isa(I)) { - SpeculatedDbgIntrinsics.push_back(I); continue; } @@ -2338,6 +2338,12 @@ I.dropUnknownNonDebugMetadata(); } + // A hoisted conditional probe should be treated as dangling so that it will + // not be over-counted when the samples collected on the non-conditional path + // are counted towards the conditional path. We leave it for the counts + // inference algorithm to figure out a proper count for a danglng probe. + moveAndDanglePseudoProbes(ThenBB, BI); + // Hoist the instructions. BB->getInstList().splice(BI->getIterator(), ThenBB->getInstList(), ThenBB->begin(), std::prev(ThenBB->end())); @@ -6228,7 +6234,7 @@ Options.NeedCanonicalLoop && (!LoopHeaders.empty() && BB->hasNPredecessorsOrMore(2) && (is_contained(LoopHeaders, BB) || is_contained(LoopHeaders, Succ))); - BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator(); + BasicBlock::iterator I = BB->getFirstNonPHIOrDbg(true)->getIterator(); if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && !NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB, DTU)) return true; @@ -6289,8 +6295,8 @@ return requestResimplify(); // This block must be empty, except for the setcond inst, if it exists. - // Ignore dbg intrinsics. - auto I = BB->instructionsWithoutDebug().begin(); + // Ignore dbg and pseudo intrinsics. + auto I = BB->instructionsWithoutDebug(true).begin(); if (&*I == BI) { if (FoldValueComparisonIntoPredecessors(BI, Builder)) return requestResimplify(); diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll @@ -0,0 +1,65 @@ +; REQUIRES: x86_64-linux +; RUN: opt < %s -passes='pseudo-probe,jump-threading' -S -o %t +; RUN: FileCheck %s < %t --check-prefix=IR +; RUN: llc -pseudo-probe-for-profiling -function-sections <%t -filetype=asm | FileCheck %s --check-prefix=ASM +; RUN: opt < %s -passes='pseudo-probe' -S -o %t1 +; RUN: llc -pseudo-probe-for-profiling -stop-after=tailduplication <%t1 | FileCheck %s --check-prefix=MIR-tail + +declare i32 @f1() + +define i32 @foo(i1 %cond) { +; IR: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 1, i32 0, i64 -1) +; ASM: pseudoprobe 6699318081062747564 1 0 0 + %call = call i32 @f1() + br i1 %cond, label %T, label %F +T: + br label %Merge +F: + br label %Merge +Merge: +;; Check branch T and F are gone, and their probes (probe 2 and 3) are dangling. +; IR-LABEL-NO: T +; IR-LABEL-NO: F +; IR-LABEL: Merge +; IR: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3, i32 2, i64 -1) +; IR: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2, i32 2, i64 -1) +; IR: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1) +; ASM: .pseudoprobe 6699318081062747564 3 0 2 +; ASM: .pseudoprobe 6699318081062747564 2 0 2 +; ASM: .pseudoprobe 6699318081062747564 4 0 0 + ret i32 %call +} + +;; Check block T and F are gone, and their probes (probe 2 and 3) are dangling. +; MIR-tail: bb.0 +; MIR-tail: PSEUDO_PROBE [[#GUID:]], 1, 0, 0 +; MIR-tail: PSEUDO_PROBE [[#GUID:]], 2, 0, 2 +; MIR-tail: PSEUDO_PROBE [[#GUID:]], 3, 0, 2 +; MIR-tail: PSEUDO_PROBE [[#GUID:]], 4, 0, 0 + + +define void @foo2() { +bb: + %tmp = call i32 @f1() + %tmp1 = icmp eq i32 %tmp, 1 + br i1 %tmp1, label %bb5, label %bb8 + +bb2: + %tmp4 = icmp ne i32 %tmp, 1 + switch i1 %tmp4, label %bb2 [ + i1 0, label %bb5 + i1 1, label %bb8 + ] + +bb5: +;; Check the pseudo probe with id 3 only has one copy. +; IR-COUNT-1: call void @llvm.pseudoprobe(i64 [[#GUID2:]], i64 3, i32 2, i64 -1) + %tmp6 = phi i1 [ %tmp1, %bb ], [ false, %bb2 ] + br i1 %tmp6, label %bb8, label %bb7 + +bb7: + br label %bb8 + +bb8: + ret void +} diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-dedup.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-dedup.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-dedup.ll @@ -0,0 +1,40 @@ +; REQUIRES: x86_64-linux +; RUN: llc -pseudo-probe-for-profiling %s -filetype=asm -o - | FileCheck %s + +declare i32 @f1() +declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0 + +define void @foo2() { +bb: +; CHECK: .pseudoprobe 2494702099028631698 1 0 0 + call void @llvm.pseudoprobe(i64 2494702099028631698, i64 1, i32 0, i64 -1) + %tmp = call i32 @f1() + %tmp1 = icmp eq i32 %tmp, 1 + br i1 %tmp1, label %bb5, label %bb8 + +bb2: +;; Check the pseudo probe with id 2 only has one copy. +; CHECK-COUNT-1: .pseudoprobe 2494702099028631698 2 0 2 + call void @llvm.pseudoprobe(i64 2494702099028631698, i64 2, i32 2, i64 -1) + %tmp4 = icmp ne i32 %tmp, 1 + call void @llvm.pseudoprobe(i64 2494702099028631698, i64 2, i32 2, i64 -1) + switch i1 %tmp4, label %bb2 [ + i1 0, label %bb5 + i1 1, label %bb8 + ] + +bb5: + %tmp6 = phi i1 [ %tmp1, %bb ], [ false, %bb2 ] + call void @llvm.pseudoprobe(i64 2494702099028631698, i64 2, i32 2, i64 -1) + br i1 %tmp6, label %bb8, label %bb7 + +bb7: + br label %bb8 + +bb8: + ret void +} + +!llvm.pseudo_probe_desc = !{!0} + +!0 = !{i64 2494702099028631698, i64 281612674956943, !"foo2", null} \ No newline at end of file