diff --git a/llvm/include/llvm/IR/PseudoProbe.h b/llvm/include/llvm/IR/PseudoProbe.h --- a/llvm/include/llvm/IR/PseudoProbe.h +++ b/llvm/include/llvm/IR/PseudoProbe.h @@ -21,6 +21,7 @@ namespace llvm { class Instruction; +class DILocation; constexpr const char *PseudoProbeDescMetadataName = "llvm.pseudo_probe_desc"; @@ -78,10 +79,22 @@ constexpr static uint8_t FullDistributionFactor = 100; }; +class PseudoProbeDescriptor { + uint64_t FunctionGUID; + uint64_t FunctionHash; + +public: + PseudoProbeDescriptor(uint64_t GUID, uint64_t Hash) + : FunctionGUID(GUID), FunctionHash(Hash) {} + uint64_t getFunctionGUID() const { return FunctionGUID; } + uint64_t getFunctionHash() const { return FunctionHash; } +}; + struct PseudoProbe { uint32_t Id; uint32_t Type; uint32_t Attr; + uint32_t Discriminator; // Distribution factor that estimates the portion of the real execution count. // A saturated distribution factor stands for 1.0 or 100%. A pesudo probe has // a factor with the value ranged from 0.0 to 1.0. diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h --- a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h +++ b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h @@ -40,16 +40,6 @@ pair_hash>; using FuncProbeFactorMap = StringMap; -class PseudoProbeDescriptor { - uint64_t FunctionGUID; - uint64_t FunctionHash; - -public: - PseudoProbeDescriptor(uint64_t GUID, uint64_t Hash) - : FunctionGUID(GUID), FunctionHash(Hash) {} - uint64_t getFunctionGUID() const { return FunctionGUID; } - uint64_t getFunctionHash() const { return FunctionHash; } -}; // A pseudo probe verifier that can be run after each IR passes to detect the // violation of updating probe factors. In principle, the sum of distribution @@ -78,20 +68,6 @@ const ProbeFactorMap &ProbeFactors); }; -// This class serves sample counts correlation for SampleProfileLoader by -// analyzing pseudo probes and their function descriptors injected by -// SampleProfileProber. -class PseudoProbeManager { - DenseMap GUIDToProbeDescMap; - - const PseudoProbeDescriptor *getDesc(const Function &F) const; - -public: - PseudoProbeManager(const Module &M); - bool moduleIsProbed(const Module &M) const; - bool profileIsValid(const Function &F, const FunctionSamples &Samples) const; -}; - /// Sample profile pseudo prober. /// /// Insert pseudo probes for block sampling and value sampling. diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h --- a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h +++ b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h @@ -34,6 +34,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PseudoProbe.h" #include "llvm/ProfileData/SampleProf.h" #include "llvm/ProfileData/SampleProfReader.h" #include "llvm/Support/CommandLine.h" @@ -80,6 +81,55 @@ } // end namespace afdo_detail +// This class serves sample counts correlation for SampleProfileLoader by +// analyzing pseudo probes and their function descriptors injected by +// SampleProfileProber. +class PseudoProbeManager { + DenseMap GUIDToProbeDescMap; + + const PseudoProbeDescriptor *getDesc(const Function &F) const { + auto I = GUIDToProbeDescMap.find( + Function::getGUID(FunctionSamples::getCanonicalFnName(F))); + return I == GUIDToProbeDescMap.end() ? nullptr : &I->second; + } + +public: + PseudoProbeManager(const Module &M) { + if (NamedMDNode *FuncInfo = + M.getNamedMetadata(PseudoProbeDescMetadataName)) { + for (const auto *Operand : FuncInfo->operands()) { + const auto *MD = cast(Operand); + auto GUID = mdconst::dyn_extract(MD->getOperand(0)) + ->getZExtValue(); + auto Hash = mdconst::dyn_extract(MD->getOperand(1)) + ->getZExtValue(); + GUIDToProbeDescMap.try_emplace(GUID, PseudoProbeDescriptor(GUID, Hash)); + } + } + } + + bool moduleIsProbed(const Module &M) const { + return M.getNamedMetadata(PseudoProbeDescMetadataName); + } + + bool profileIsValid(const Function &F, const FunctionSamples &Samples) const { + const auto *Desc = getDesc(F); + if (!Desc) { + LLVM_DEBUG(dbgs() << "Probe descriptor missing for Function " + << F.getName() << "\n"); + return false; + } + if (Desc->getFunctionHash() != Samples.getFunctionHash()) { + LLVM_DEBUG(dbgs() << "Hash mismatch for Function " << F.getName() + << "\n"); + return false; + } + return true; + } +}; + + + extern cl::opt SampleProfileUseProfi; template class SampleProfileLoaderBaseImpl { @@ -137,6 +187,7 @@ unsigned getFunctionLoc(FunctionT &Func); virtual ErrorOr getInstWeight(const InstructionT &Inst); ErrorOr getInstWeightImpl(const InstructionT &Inst); + virtual ErrorOr getProbeWeight(const InstructionT &Inst); ErrorOr getBlockWeight(const BasicBlockT *BB); mutable DenseMap DILocation2SampleMap; @@ -212,6 +263,9 @@ /// Profile reader object. std::unique_ptr Reader; + // A pseudo probe helper to correlate the imported sample counts. + std::unique_ptr ProbeManager; + /// Samples collected for the body of this function. FunctionSamples *Samples = nullptr; @@ -299,6 +353,8 @@ template ErrorOr SampleProfileLoaderBaseImpl::getInstWeight(const InstructionT &Inst) { + if (FunctionSamples::ProfileIsProbeBased) + return getProbeWeight(Inst); return getInstWeightImpl(Inst); } @@ -346,6 +402,65 @@ return R; } +// Here use error_code to represent: 1) The dangling probe. 2) Ignore the weight +// of non-probe instruction. So if all instructions of the BB give error_code, +// tell the inference algorithm to infer the BB weight. +template +ErrorOr +SampleProfileLoaderBaseImpl::getProbeWeight(const InstructionT &Inst) { + assert(FunctionSamples::ProfileIsProbeBased && + "Profile is not pseudo probe based"); + std::optional Probe = extractProbe(Inst); + // Ignore the non-probe instruction. If none of the instruction in the BB is + // probe, we choose to infer the BB's weight. + if (!Probe) + return std::error_code(); + + const FunctionSamples *FS = findFunctionSamples(Inst); + // If none of the instruction has FunctionSample, we choose to return zero + // value sample to indicate the BB is cold. This could happen when the + // instruction is from inlinee and no profile data is found. + // FIXME: This should not be affected by the source drift issue as 1) if the + // newly added function is top-level inliner, it won't match the CFG checksum + // in the function profile or 2) if it's the inlinee, the inlinee should have + // a profile, otherwise it wouldn't be inlined. For non-probe based profile, + // we can improve it by adding a switch for profile-sample-block-accurate for + // block level counts in the future. + if (!FS) + return 0; + + auto R = FS->findSamplesAt(Probe->Id, Probe->Discriminator); + if (R) { + uint64_t Samples = R.get() * Probe->Factor; + bool FirstMark = CoverageTracker.markSamplesUsed(FS, Probe->Id, 0, Samples); + if (FirstMark) { + ORE->emit([&]() { + OptRemarkAnalysisT Remark(DEBUG_TYPE, "AppliedSamples", &Inst); + Remark << "Applied " << ore::NV("NumSamples", Samples); + Remark << " samples from profile (ProbeId="; + Remark << ore::NV("ProbeId", Probe->Id); + if (Probe->Discriminator) { + Remark << "."; + Remark << ore::NV("Discriminator", Probe->Discriminator); + } + Remark << ", Factor="; + Remark << ore::NV("Factor", Probe->Factor); + Remark << ", OriginalSamples="; + Remark << ore::NV("OriginalSamples", R.get()); + Remark << ")"; + return Remark; + }); + } + LLVM_DEBUG({dbgs() << " " << Probe->Id; + if (Probe->Discriminator) + dbgs() << "." << Probe->Discriminator; + dbgs() << ":" << Inst << " - weight: " << R.get() + << " - factor: " << format("%0.2f", Probe->Factor) << ")\n";}); + return Samples; + } + return R; +} + /// Compute the weight of a basic block. /// /// The weight of basic block \p BB is the maximum weight of all the diff --git a/llvm/lib/CodeGen/MIRSampleProfile.cpp b/llvm/lib/CodeGen/MIRSampleProfile.cpp --- a/llvm/lib/CodeGen/MIRSampleProfile.cpp +++ b/llvm/lib/CodeGen/MIRSampleProfile.cpp @@ -18,11 +18,13 @@ #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/PseudoProbe.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -30,6 +32,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h" +#include using namespace llvm; using namespace sampleprof; @@ -92,6 +95,22 @@ // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name= extern cl::opt ViewBlockFreqFuncName; +std::optional extractProbe(const MachineInstr &MI) { + if (MI.isPseudoProbe()) { + PseudoProbe Probe; + Probe.Id = MI.getOperand(1).getImm(); + Probe.Type = MI.getOperand(2).getImm(); + Probe.Attr = MI.getOperand(3).getImm(); + Probe.Factor = 1; + DILocation *DebugLoc = MI.getDebugLoc(); + Probe.Discriminator = DebugLoc ? DebugLoc->getDiscriminator() : 0; + return Probe; + } + + // Ignore callsite probes since they do not have FS discriminators. + return std::nullopt; +} + namespace afdo_detail { template <> struct IRTraits { using InstructionT = MachineInstr; @@ -167,6 +186,8 @@ bool ProfileIsValid = true; ErrorOr getInstWeight(const MachineInstr &MI) override { + if (FunctionSamples::ProfileIsProbeBased) + return getProbeWeight(MI); if (ImprovedFSDiscriminator && MI.isMetaInstruction()) return std::error_code(); return getInstWeightImpl(MI); @@ -275,6 +296,14 @@ Reader->setModule(&M); ProfileIsValid = (Reader->read() == sampleprof_error::success); + // Load pseudo probe descriptors for probe-based function samples. + if (Reader->profileIsProbeBased()) { + ProbeManager = std::make_unique(M); + if (!ProbeManager->moduleIsProbed(M)) { + return false; + } + } + return true; } @@ -285,8 +314,13 @@ if (!Samples || Samples->empty()) return false; - if (getFunctionLoc(MF) == 0) - return false; + if (FunctionSamples::ProfileIsProbeBased) { + if (!ProbeManager->profileIsValid(MF.getFunction(), *Samples)) + return false; + } else { + if (getFunctionLoc(MF) == 0) + return false; + } DenseSet InlinedGUIDs; bool Changed = computeAndPropagateWeights(MF, InlinedGUIDs); diff --git a/llvm/lib/IR/PseudoProbe.cpp b/llvm/lib/IR/PseudoProbe.cpp --- a/llvm/lib/IR/PseudoProbe.cpp +++ b/llvm/lib/IR/PseudoProbe.cpp @@ -22,12 +22,8 @@ namespace llvm { std::optional -extractProbeFromDiscriminator(const Instruction &Inst) { - assert(isa(&Inst) && !isa(&Inst) && - "Only call instructions should have pseudo probe encodes as their " - "Dwarf discriminators"); - if (const DebugLoc &DLoc = Inst.getDebugLoc()) { - const DILocation *DIL = DLoc; +extractProbeFromDiscriminator(const DILocation *DIL) { + if (DIL) { auto Discriminator = DIL->getDiscriminator(); if (DILocation::isPseudoProbeDiscriminator(Discriminator)) { PseudoProbe Probe; @@ -40,12 +36,23 @@ Probe.Factor = PseudoProbeDwarfDiscriminator::extractProbeFactor(Discriminator) / (float)PseudoProbeDwarfDiscriminator::FullDistributionFactor; + Probe.Discriminator = 0; return Probe; } } return std::nullopt; } +std::optional +extractProbeFromDiscriminator(const Instruction &Inst) { + assert(isa(&Inst) && !isa(&Inst) && + "Only call instructions should have pseudo probe encodes as their " + "Dwarf discriminators"); + if (const DebugLoc &DLoc = Inst.getDebugLoc()) + return extractProbeFromDiscriminator(DLoc); + return std::nullopt; +} + std::optional extractProbe(const Instruction &Inst) { if (const auto *II = dyn_cast(&Inst)) { PseudoProbe Probe; @@ -54,6 +61,11 @@ Probe.Attr = II->getAttributes()->getZExtValue(); Probe.Factor = II->getFactor()->getZExtValue() / (float)PseudoProbeFullDistributionFactor; + Probe.Discriminator = 0; + if (const DebugLoc &DLoc = Inst.getDebugLoc()) + Probe.Discriminator = DLoc->getDiscriminator(); + assert(Probe.Discriminator == 0 && + "Unexpected non-zero FS-discriminator for IR pseudo probes"); return Probe; } diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -532,7 +532,6 @@ bool runOnFunction(Function &F, ModuleAnalysisManager *AM); bool emitAnnotations(Function &F); ErrorOr getInstWeight(const Instruction &I) override; - ErrorOr getProbeWeight(const Instruction &I); const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const; const FunctionSamples * findFunctionSamples(const Instruction &I) const override; @@ -628,9 +627,6 @@ // External inline advisor used to replay inline decision from remarks. std::unique_ptr ExternalInlineAdvisor; - // A pseudo probe helper to correlate the imported sample counts. - std::unique_ptr ProbeManager; - // A helper to implement the sample profile matching algorithm. std::unique_ptr MatchingManager; @@ -669,68 +665,6 @@ return getInstWeightImpl(Inst); } -// Here use error_code to represent: 1) The dangling probe. 2) Ignore the weight -// of non-probe instruction. So if all instructions of the BB give error_code, -// tell the inference algorithm to infer the BB weight. -ErrorOr SampleProfileLoader::getProbeWeight(const Instruction &Inst) { - assert(FunctionSamples::ProfileIsProbeBased && - "Profile is not pseudo probe based"); - std::optional Probe = extractProbe(Inst); - // Ignore the non-probe instruction. If none of the instruction in the BB is - // probe, we choose to infer the BB's weight. - if (!Probe) - return std::error_code(); - - const FunctionSamples *FS = findFunctionSamples(Inst); - // If none of the instruction has FunctionSample, we choose to return zero - // value sample to indicate the BB is cold. This could happen when the - // instruction is from inlinee and no profile data is found. - // FIXME: This should not be affected by the source drift issue as 1) if the - // newly added function is top-level inliner, it won't match the CFG checksum - // in the function profile or 2) if it's the inlinee, the inlinee should have - // a profile, otherwise it wouldn't be inlined. For non-probe based profile, - // we can improve it by adding a switch for profile-sample-block-accurate for - // block level counts in the future. - if (!FS) - return 0; - - // For non-CS profile, If a direct call/invoke instruction is inlined in - // profile (findCalleeFunctionSamples returns non-empty result), but not - // inlined here, it means that the inlined callsite has no sample, thus the - // call instruction should have 0 count. - // For CS profile, the callsite count of previously inlined callees is - // populated with the entry count of the callees. - if (!FunctionSamples::ProfileIsCS) - if (const auto *CB = dyn_cast(&Inst)) - if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB)) - return 0; - - const ErrorOr &R = FS->findSamplesAt(Probe->Id, 0); - if (R) { - uint64_t Samples = R.get() * Probe->Factor; - bool FirstMark = CoverageTracker.markSamplesUsed(FS, Probe->Id, 0, Samples); - if (FirstMark) { - ORE->emit([&]() { - OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst); - Remark << "Applied " << ore::NV("NumSamples", Samples); - Remark << " samples from profile (ProbeId="; - Remark << ore::NV("ProbeId", Probe->Id); - Remark << ", Factor="; - Remark << ore::NV("Factor", Probe->Factor); - Remark << ", OriginalSamples="; - Remark << ore::NV("OriginalSamples", R.get()); - Remark << ")"; - return Remark; - }); - } - LLVM_DEBUG(dbgs() << " " << Probe->Id << ":" << Inst - << " - weight: " << R.get() << " - factor: " - << format("%0.2f", Probe->Factor) << ")\n"); - return Samples; - } - return R; -} - /// Get the FunctionSamples for a call instruction. /// /// The FunctionSamples of a call/invoke instruction \p Inst is the inlined diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp --- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp @@ -166,47 +166,6 @@ } } -PseudoProbeManager::PseudoProbeManager(const Module &M) { - if (NamedMDNode *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName)) { - for (const auto *Operand : FuncInfo->operands()) { - const auto *MD = cast(Operand); - auto GUID = - mdconst::dyn_extract(MD->getOperand(0))->getZExtValue(); - auto Hash = - mdconst::dyn_extract(MD->getOperand(1))->getZExtValue(); - GUIDToProbeDescMap.try_emplace(GUID, PseudoProbeDescriptor(GUID, Hash)); - } - } -} - -const PseudoProbeDescriptor * -PseudoProbeManager::getDesc(const Function &F) const { - auto I = GUIDToProbeDescMap.find( - Function::getGUID(FunctionSamples::getCanonicalFnName(F))); - return I == GUIDToProbeDescMap.end() ? nullptr : &I->second; -} - -bool PseudoProbeManager::moduleIsProbed(const Module &M) const { - return M.getNamedMetadata(PseudoProbeDescMetadataName); -} - -bool PseudoProbeManager::profileIsValid(const Function &F, - const FunctionSamples &Samples) const { - const auto *Desc = getDesc(F); - if (!Desc) { - LLVM_DEBUG(dbgs() << "Probe descriptor missing for Function " << F.getName() - << "\n"); - return false; - } else { - if (Desc->getFunctionHash() != Samples.getFunctionHash()) { - LLVM_DEBUG(dbgs() << "Hash mismatch for Function " << F.getName() - << "\n"); - return false; - } - } - return true; -} - SampleProfileProber::SampleProfileProber(Function &Func, const std::string &CurModuleUniqueId) : F(&Func), CurModuleUniqueId(CurModuleUniqueId) { diff --git a/llvm/test/CodeGen/X86/Inputs/fsloader-probe.afdo b/llvm/test/CodeGen/X86/Inputs/fsloader-probe.afdo new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/Inputs/fsloader-probe.afdo @@ -0,0 +1,40 @@ +foo:884430:431 + 1: 431 + 2: 431 + 2.2048: 19368 + 4: 19332 + 4.512: 24813 + 4.4608: 20867 + 4.9216: 19368 + 4.491520: 24782 + 5: 19332 + 5.2560: 24813 + 5.6144: 20867 + 5.14336: 24782 + 6: 0 + 6.4608: 0 + 6.15872: 26051 + 6.98304: 25893 + 7: 24465 + 7.1024: 25581 + 7.9216: 26128 + 7.11264: 24371 + 8: 0 + 8.11776: 26128 + 8.12288: 25581 + 8.13824: 24371 + 9: 24782 + 9.7168: 19368 + 9.10752: 20867 + 9.14848: 24813 + 10: 24782 + 10.512: 24813 + 10.4608: 20867 + 10.9216: 19368 + 11: 19368 + 12: 19368 + 13: 461 + 14: 98698 bar:98698 + 15: 51957 work:51957 + 16: 76609 work:76609 + !CFGChecksum: 844700110938769 diff --git a/llvm/test/CodeGen/X86/fsafdo_probe2.ll b/llvm/test/CodeGen/X86/fsafdo_probe2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/fsafdo_probe2.ll @@ -0,0 +1,322 @@ +; RUN: llvm-profdata merge --sample -profile-isfs --extbinary -o %t.afdo %S/Inputs/fsloader-probe.afdo +; RUN: llc -enable-fs-discriminator -fs-profile-file=%t.afdo -show-fs-branchprob -disable-ra-fsprofile-loader=false -disable-layout-fsprofile-loader=false < %s 2>&1 | FileCheck %s --check-prefix=LOADER +; +;; +;; C source code for the test. +;; Compiled with clang -O3 -g -fdebug-info-for-profiling -fpseudo-probe-for-profiling -mllvm --enable-fs-discriminator +;; // A test case for loop unroll. +;; +;; __attribute__((noinline)) int bar(int i){ +;; volatile int j; +;; j = i; +;; return j; +;; } +;; +;; unsigned sum; +;; __attribute__((noinline)) void work(int i){ +;; if (sum % 7) +;; sum += i; +;; else +;; sum -= i; +;; } +;; +;; __attribute__((noinline)) void foo(){ +;; int i, j; +;; for (j = 0; j < 48; j++) +;; for (i = 0; i < 4; i++) { +;; int ii = bar(i+j*48); +;; if (ii % 2) +;; work(ii*2); +;; if (ii % 4) +;; work(ii*3); +;; } +;; } +;; +;; int main() { +;; int i; +;; for (i = 0; i < 10000000; i++) { +;; foo(); +;; } +;; } +;; +;; + +;; Check that new branch probs are generated. + +; LOADER: Set branch fs prob: MBB (3 -> 5): unroll.c:22:12-->unroll.c:20:12 W=44114 0x30000000 / 0x80000000 = 37.50% --> 0x80000000 / 0x80000000 = 100.00% +; LOADER: Set branch fs prob: MBB (3 -> 4): unroll.c:22:12 W=44114 0x50000000 / 0x80000000 = 62.50% --> 0x00000000 / 0x80000000 = 0.00% +; LOADER: Set branch fs prob: MBB (9 -> 11): unroll.c:20:12-->unroll.c:22:12 W=44114 0x40000000 / 0x80000000 = 50.00% --> 0x80000000 / 0x80000000 = 100.00% +; LOADER: Set branch fs prob: MBB (9 -> 10): unroll.c:20:12 W=44114 0x40000000 / 0x80000000 = 50.00% --> 0x00000000 / 0x80000000 = 0.00% +; LOADER: Set branch fs prob: MBB (1 -> 3): unroll.c:20:12-->unroll.c:22:12 W=26128 0x34de9bd3 / 0x80000000 = 41.30% --> 0x80000000 / 0x80000000 = 100.00% +; LOADER: Set branch fs prob: MBB (1 -> 2): unroll.c:20:12 W=26128 0x4b21642d / 0x80000000 = 58.70% --> 0x00000000 / 0x80000000 = 0.00% +; LOADER: Set branch fs prob: MBB (5 -> 7): unroll.c:20:12-->unroll.c:22:12 W=26128 0x34693ef1 / 0x80000000 = 40.95% --> 0x0060917b / 0x80000000 = 0.29% +; LOADER: Set branch fs prob: MBB (5 -> 6): unroll.c:20:12 W=26128 0x4b96c10f / 0x80000000 = 59.05% --> 0x7f9f6e85 / 0x80000000 = 99.71% +; LOADER: Set branch fs prob: MBB (7 -> 9): unroll.c:22:12-->unroll.c:20:12 W=26128 0x34300cd0 / 0x80000000 = 40.77% --> 0x00000000 / 0x80000000 = 0.00% +; LOADER: Set branch fs prob: MBB (7 -> 8): unroll.c:22:12 W=26128 0x4bcff330 / 0x80000000 = 59.23% --> 0x80000000 / 0x80000000 = 100.00% +; LOADER: Set branch fs prob: MBB (11 -> 13): unroll.c:22:12-->unroll.c:20:12 W=26128 0x35c65cf7 / 0x80000000 = 42.01% --> 0x02ae02d2 / 0x80000000 = 2.09% +; LOADER: Set branch fs prob: MBB (11 -> 12): unroll.c:22:12 W=26128 0x4a39a309 / 0x80000000 = 57.99% --> 0x7d51fd2e / 0x80000000 = 97.91% +; LOADER: Set branch fs prob: MBB (13 -> 15): unroll.c:20:12-->unroll.c:22:12 W=26128 0x34de9bd3 / 0x80000000 = 41.30% --> 0x0126b8ac / 0x80000000 = 0.90% +; LOADER: Set branch fs prob: MBB (13 -> 14): unroll.c:20:12 W=26128 0x4b21642d / 0x80000000 = 58.70% --> 0x7ed94754 / 0x80000000 = 99.10% +; LOADER: Set branch fs prob: MBB (15 -> 17): unroll.c:22:12-->unroll.c:17:4 W=26128 0x3949278b / 0x80000000 = 44.75% --> 0x089b8337 / 0x80000000 = 6.72% +; LOADER: Set branch fs prob: MBB (15 -> 16): unroll.c:22:12 W=26128 0x46b6d875 / 0x80000000 = 55.25% --> 0x77647cc9 / 0x80000000 = 93.28% + + + +target triple = "x86_64-unknown-linux-gnu" + + +@sum = dso_local local_unnamed_addr global i32 0, align 4, !dbg !0 +@__llvm_fs_discriminator__ = weak_odr constant i1 true +@llvm.used = appending global [1 x ptr] [ptr @__llvm_fs_discriminator__], section "llvm.metadata" + +; Function Attrs: nofree noinline nounwind memory(inaccessiblemem: readwrite) uwtable +declare dso_local i32 @bar(i32 noundef %i) local_unnamed_addr #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none) uwtable +declare dso_local void @work(i32 noundef %i) local_unnamed_addr #3 + +; Function Attrs: nofree noinline nounwind uwtable +define dso_local void @foo() local_unnamed_addr #4 !dbg !47 { +entry: + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 1, i32 0, i64 -1), !dbg !59 + call void @llvm.dbg.value(metadata i32 0, metadata !52, metadata !DIExpression()), !dbg !60 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 2, i32 0, i64 -1), !dbg !61 + br label %for.cond1.preheader, !dbg !63 + +for.cond1.preheader: ; preds = %entry, %if.end9.3 + %lsr.iv = phi i32 [ 3, %entry ], [ %lsr.iv.next, %if.end9.3 ] + call void @llvm.dbg.value(metadata i32 %lsr.iv, metadata !52, metadata !DIExpression(DW_OP_consts, 3, DW_OP_minus, DW_OP_consts, 48, DW_OP_div, DW_OP_stack_value)), !dbg !60 + call void @llvm.dbg.value(metadata i32 0, metadata !51, metadata !DIExpression()), !dbg !60 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1), !dbg !65 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 5, i32 0, i64 -1), !dbg !67 + %0 = add i32 %lsr.iv, -3, !dbg !65 + call void @llvm.dbg.value(metadata i32 0, metadata !51, metadata !DIExpression()), !dbg !60 + %call = tail call i32 @bar(i32 noundef %0), !dbg !68 + call void @llvm.dbg.value(metadata i32 %call, metadata !53, metadata !DIExpression()), !dbg !70 + %1 = and i32 %call, 1, !dbg !71 + %tobool.not = icmp eq i32 %1, 0, !dbg !71 + br i1 %tobool.not, label %if.end, label %if.then, !dbg !73 + +if.then: ; preds = %for.cond1.preheader + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 6, i32 0, i64 -1), !dbg !74 + %mul4 = shl nsw i32 %call, 1, !dbg !75 + tail call void @work(i32 noundef %mul4), !dbg !76 + br label %if.end, !dbg !78 + +if.end: ; preds = %if.then, %for.cond1.preheader + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 7, i32 0, i64 -1), !dbg !79 + %2 = and i32 %call, 3, !dbg !81 + %tobool6.not = icmp eq i32 %2, 0, !dbg !81 + br i1 %tobool6.not, label %if.end9, label %if.then7, !dbg !82 + +if.then7: ; preds = %if.end + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 8, i32 0, i64 -1), !dbg !83 + %mul8 = mul nsw i32 %call, 3, !dbg !84 + tail call void @work(i32 noundef %mul8), !dbg !85 + br label %if.end9, !dbg !87 + +if.end9: ; preds = %if.then7, %if.end + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 9, i32 0, i64 -1), !dbg !88 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 10, i32 0, i64 -1), !dbg !89 + call void @llvm.dbg.value(metadata i32 1, metadata !51, metadata !DIExpression()), !dbg !60 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1), !dbg !65 + call void @llvm.dbg.value(metadata i32 1, metadata !51, metadata !DIExpression()), !dbg !60 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 5, i32 0, i64 -1), !dbg !67 + %3 = add i32 %lsr.iv, -2, !dbg !68 + %call.1 = tail call i32 @bar(i32 noundef %3), !dbg !68 + call void @llvm.dbg.value(metadata i32 %call.1, metadata !53, metadata !DIExpression()), !dbg !70 + %4 = and i32 %call.1, 1, !dbg !71 + %tobool.not.1 = icmp eq i32 %4, 0, !dbg !71 + br i1 %tobool.not.1, label %if.end.1, label %if.then.1, !dbg !73 + +if.then.1: ; preds = %if.end9 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 6, i32 0, i64 -1), !dbg !74 + %mul4.1 = shl nsw i32 %call.1, 1, !dbg !75 + tail call void @work(i32 noundef %mul4.1), !dbg !76 + br label %if.end.1, !dbg !78 + +if.end.1: ; preds = %if.then.1, %if.end9 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 7, i32 0, i64 -1), !dbg !79 + %5 = and i32 %call.1, 3, !dbg !81 + %tobool6.not.1 = icmp eq i32 %5, 0, !dbg !81 + br i1 %tobool6.not.1, label %if.end9.1, label %if.then7.1, !dbg !82 + +if.then7.1: ; preds = %if.end.1 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 8, i32 0, i64 -1), !dbg !83 + %mul8.1 = mul nsw i32 %call.1, 3, !dbg !84 + tail call void @work(i32 noundef %mul8.1), !dbg !85 + br label %if.end9.1, !dbg !87 + +if.end9.1: ; preds = %if.then7.1, %if.end.1 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 9, i32 0, i64 -1), !dbg !88 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 10, i32 0, i64 -1), !dbg !89 + call void @llvm.dbg.value(metadata i32 2, metadata !51, metadata !DIExpression()), !dbg !60 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1), !dbg !65 + call void @llvm.dbg.value(metadata i32 2, metadata !51, metadata !DIExpression()), !dbg !60 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 5, i32 0, i64 -1), !dbg !67 + %6 = add i32 %lsr.iv, -1, !dbg !68 + %call.2 = tail call i32 @bar(i32 noundef %6), !dbg !68 + call void @llvm.dbg.value(metadata i32 %call.2, metadata !53, metadata !DIExpression()), !dbg !70 + %7 = and i32 %call.2, 1, !dbg !71 + %tobool.not.2 = icmp eq i32 %7, 0, !dbg !71 + br i1 %tobool.not.2, label %if.end.2, label %if.then.2, !dbg !73 + +if.then.2: ; preds = %if.end9.1 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 6, i32 0, i64 -1), !dbg !74 + %mul4.2 = shl nsw i32 %call.2, 1, !dbg !75 + tail call void @work(i32 noundef %mul4.2), !dbg !76 + br label %if.end.2, !dbg !78 + +if.end.2: ; preds = %if.then.2, %if.end9.1 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 7, i32 0, i64 -1), !dbg !79 + %8 = and i32 %call.2, 3, !dbg !81 + %tobool6.not.2 = icmp eq i32 %8, 0, !dbg !81 + br i1 %tobool6.not.2, label %if.end9.2, label %if.then7.2, !dbg !82 + +if.then7.2: ; preds = %if.end.2 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 8, i32 0, i64 -1), !dbg !83 + %mul8.2 = mul nsw i32 %call.2, 3, !dbg !84 + tail call void @work(i32 noundef %mul8.2), !dbg !85 + br label %if.end9.2, !dbg !87 + +if.end9.2: ; preds = %if.then7.2, %if.end.2 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 9, i32 0, i64 -1), !dbg !88 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 10, i32 0, i64 -1), !dbg !89 + call void @llvm.dbg.value(metadata i32 3, metadata !51, metadata !DIExpression()), !dbg !60 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1), !dbg !65 + call void @llvm.dbg.value(metadata i32 3, metadata !51, metadata !DIExpression()), !dbg !60 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 5, i32 0, i64 -1), !dbg !67 + %call.3 = tail call i32 @bar(i32 noundef %lsr.iv), !dbg !68 + call void @llvm.dbg.value(metadata i32 %call.3, metadata !53, metadata !DIExpression()), !dbg !70 + %9 = and i32 %call.3, 1, !dbg !71 + %tobool.not.3 = icmp eq i32 %9, 0, !dbg !71 + br i1 %tobool.not.3, label %if.end.3, label %if.then.3, !dbg !73 + +if.then.3: ; preds = %if.end9.2 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 6, i32 0, i64 -1), !dbg !74 + %mul4.3 = shl nsw i32 %call.3, 1, !dbg !75 + tail call void @work(i32 noundef %mul4.3), !dbg !76 + br label %if.end.3, !dbg !78 + +if.end.3: ; preds = %if.then.3, %if.end9.2 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 7, i32 0, i64 -1), !dbg !79 + %10 = and i32 %call.3, 3, !dbg !81 + %tobool6.not.3 = icmp eq i32 %10, 0, !dbg !81 + br i1 %tobool6.not.3, label %if.end9.3, label %if.then7.3, !dbg !82 + +if.then7.3: ; preds = %if.end.3 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 8, i32 0, i64 -1), !dbg !83 + %mul8.3 = mul nsw i32 %call.3, 3, !dbg !84 + tail call void @work(i32 noundef %mul8.3), !dbg !85 + br label %if.end9.3, !dbg !87 + +if.end9.3: ; preds = %if.then7.3, %if.end.3 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 9, i32 0, i64 -1), !dbg !88 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 10, i32 0, i64 -1), !dbg !89 + call void @llvm.dbg.value(metadata i32 4, metadata !51, metadata !DIExpression()), !dbg !60 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1), !dbg !65 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 11, i32 0, i64 -1), !dbg !90 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 12, i32 0, i64 -1), !dbg !92 + call void @llvm.dbg.value(metadata i32 %lsr.iv, metadata !52, metadata !DIExpression(DW_OP_consts, 3, DW_OP_minus, DW_OP_consts, 48, DW_OP_div, DW_OP_consts, 1, DW_OP_plus, DW_OP_stack_value)), !dbg !60 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 2, i32 0, i64 -1), !dbg !61 + %lsr.iv.next = add nuw nsw i32 %lsr.iv, 48, !dbg !93 + %exitcond.not = icmp eq i32 %lsr.iv.next, 2307, !dbg !93 + br i1 %exitcond.not, label %for.end12, label %for.cond1.preheader, !dbg !63, !llvm.loop !95 + +for.end12: ; preds = %if.end9.3 + call void @llvm.pseudoprobe(i64 6699318081062747564, i64 13, i32 0, i64 -1), !dbg !99 + ret void, !dbg !99 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) +declare void @llvm.pseudoprobe(i64, i64, i32, i64) #6 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.value(metadata, metadata, metadata) #7 + +attributes #0 = { nofree noinline nounwind memory(inaccessiblemem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #3 = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { nofree noinline nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #5 = { nofree nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } +attributes #7 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!6, !7, !8, !9} +!llvm.ident = !{!10} +!llvm.pseudo_probe_desc = !{!11, !12, !13, !14} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "sum", scope: !2, file: !3, line: 7, type: !5, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "clang version 17.0.0 (https://github.com/llvm/llvm-project.git fb16df500443aa5129f4a5e4dc4d9dcac613a809)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!3 = !DIFile(filename: "unroll.c", directory: "/home/hoy/build/llvm-github", checksumkind: CSK_MD5, checksum: "11508da575b4d414f8b2f39cf4d90184") +!4 = !{!0} +!5 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) +!6 = !{i32 7, !"Dwarf Version", i32 5} +!7 = !{i32 2, !"Debug Info Version", i32 3} +!8 = !{i32 1, !"wchar_size", i32 4} +!9 = !{i32 7, !"uwtable", i32 2} +!10 = !{!"clang version 17.0.0 (https://github.com/llvm/llvm-project.git fb16df500443aa5129f4a5e4dc4d9dcac613a809)"} +!11 = !{i64 -2012135647395072713, i64 4294967295, !"bar"} +!12 = !{i64 9204417991963109735, i64 72617220756, !"work"} +!13 = !{i64 6699318081062747564, i64 844700110938769, !"foo"} +!14 = !{i64 -2624081020897602054, i64 281563657672557, !"main"} +!18 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!47 = distinct !DISubprogram(name: "foo", scope: !3, file: !3, line: 15, type: !48, scopeLine: 15, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !50) +!48 = !DISubroutineType(types: !49) +!49 = !{null} +!50 = !{!51, !52, !53} +!51 = !DILocalVariable(name: "i", scope: !47, file: !3, line: 16, type: !18) +!52 = !DILocalVariable(name: "j", scope: !47, file: !3, line: 16, type: !18) +!53 = !DILocalVariable(name: "ii", scope: !54, file: !3, line: 19, type: !18) +!54 = distinct !DILexicalBlock(scope: !55, file: !3, line: 18, column: 30) +!55 = distinct !DILexicalBlock(scope: !56, file: !3, line: 18, column: 6) +!56 = distinct !DILexicalBlock(scope: !57, file: !3, line: 18, column: 6) +!57 = distinct !DILexicalBlock(scope: !58, file: !3, line: 17, column: 4) +!58 = distinct !DILexicalBlock(scope: !47, file: !3, line: 17, column: 4) +!59 = !DILocation(line: 17, column: 11, scope: !58) +!60 = !DILocation(line: 0, scope: !47) +!61 = !DILocation(line: 17, column: 16, scope: !62) +!62 = !DILexicalBlockFile(scope: !57, file: !3, discriminator: 0) +!63 = !DILocation(line: 17, column: 4, scope: !64) +!64 = !DILexicalBlockFile(scope: !58, file: !3, discriminator: 1) +!65 = !DILocation(line: 18, column: 18, scope: !66) +!66 = !DILexicalBlockFile(scope: !55, file: !3, discriminator: 0) +!67 = !DILocation(line: 19, column: 21, scope: !54) +!68 = !DILocation(line: 19, column: 17, scope: !69) +!69 = !DILexicalBlockFile(scope: !54, file: !3, discriminator: 186646647) +!70 = !DILocation(line: 0, scope: !54) +!71 = !DILocation(line: 20, column: 15, scope: !72) +!72 = distinct !DILexicalBlock(scope: !54, file: !3, line: 20, column: 12) +!73 = !DILocation(line: 20, column: 12, scope: !54) +!74 = !DILocation(line: 21, column: 15, scope: !72) +!75 = !DILocation(line: 21, column: 17, scope: !72) +!76 = !DILocation(line: 21, column: 10, scope: !77) +!77 = !DILexicalBlockFile(scope: !72, file: !3, discriminator: 186646655) +!78 = !DILocation(line: 21, column: 10, scope: !72) +!79 = !DILocation(line: 22, column: 12, scope: !80) +!80 = distinct !DILexicalBlock(scope: !54, file: !3, line: 22, column: 12) +!81 = !DILocation(line: 22, column: 15, scope: !80) +!82 = !DILocation(line: 22, column: 12, scope: !54) +!83 = !DILocation(line: 23, column: 15, scope: !80) +!84 = !DILocation(line: 23, column: 17, scope: !80) +!85 = !DILocation(line: 23, column: 10, scope: !86) +!86 = !DILexicalBlockFile(scope: !80, file: !3, discriminator: 186646663) +!87 = !DILocation(line: 23, column: 10, scope: !80) +!88 = !DILocation(line: 24, column: 4, scope: !54) +!89 = !DILocation(line: 18, column: 26, scope: !66) +!90 = !DILocation(line: 24, column: 4, scope: !91) +!91 = !DILexicalBlockFile(scope: !56, file: !3, discriminator: 0) +!92 = !DILocation(line: 17, column: 25, scope: !62) +!93 = !DILocation(line: 17, column: 18, scope: !94) +!94 = !DILexicalBlockFile(scope: !57, file: !3, discriminator: 1) +!95 = distinct !{!95, !96, !97, !98} +!96 = !DILocation(line: 17, column: 4, scope: !58) +!97 = !DILocation(line: 24, column: 4, scope: !58) +!98 = !{!"llvm.loop.mustprogress"} +!99 = !DILocation(line: 25, column: 2, scope: !47) diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll --- a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll @@ -1,6 +1,6 @@ ; REQUIRES: x86_64-linux ; REQUIRES: asserts -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-matching.prof --salvage-stale-profile -S --debug-only=sample-profile 2>&1 | FileCheck %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-matching.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-impl 2>&1 | FileCheck %s ; The profiled source code: