diff --git a/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfscript b/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfscript --- a/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfscript +++ b/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfscript @@ -1,4 +1,3 @@ -Using perf wrapper that supports hot-text. Try perf.real if you encounter any issues. PERF_RECORD_MMAP2 2854748/2854748: [0x400000(0x1000) @ 0 00:1d 123291722 526021]: r-xp /home/inline-cs-noprobe.perfbin diff --git a/llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfscript b/llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfscript new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfscript @@ -0,0 +1,5 @@ +PERF_RECORD_MMAP2 595196/595196: [0x201000(0x1000) @ 0 00:1d 224227621 1042948]: r-xp /home/inline-cs-pseudoprobe.perfbin + + 20180e + 5541f689495641d7 + 0x201858/0x20180e/P/-/-/0 0x201858/0x20180e/P/-/-/0 0x201858/0x20180e/P/-/-/0 0x201858/0x20180e/P/-/-/0 0x201858/0x20180e/P/-/-/0 0x201858/0x20180e/P/-/-/0 0x201858/0x20180e/P/-/-/0 0x201858/0x20180e/P/-/-/0 0x201858/0x20180e/P/-/-/0 0x201858/0x20180e/P/-/-/0 0x201858/0x20180e/P/-/-/0 0x20182b/0x201800/M/-/-/0 0x201858/0x20180e/P/-/-/0 0x201858/0x20180e/P/-/-/0 0x201858/0x20180e/P/-/-/0 0x201858/0x20180e/P/-/-/0 diff --git a/llvm/test/tools/llvm-profgen/Inputs/noinline-cs-noprobe.perfscript b/llvm/test/tools/llvm-profgen/Inputs/noinline-cs-noprobe.perfscript --- a/llvm/test/tools/llvm-profgen/Inputs/noinline-cs-noprobe.perfscript +++ b/llvm/test/tools/llvm-profgen/Inputs/noinline-cs-noprobe.perfscript @@ -1,4 +1,3 @@ -Using perf wrapper that supports hot-text. Try perf.real if you encounter any issues. PERF_RECORD_MMAP2 2854748/2854748: [0x400000(0x1000) @ 0 00:1d 123291722 526021]: r-xp /home/noinline-cs-noprobe.perfbin 4005dc diff --git a/llvm/test/tools/llvm-profgen/Inputs/noinline-cs-pseudoprobe.perfbin b/llvm/test/tools/llvm-profgen/Inputs/noinline-cs-pseudoprobe.perfbin new file mode 100755 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ + +int bar(int x, int y) { + if (x % 3) { + return x - y; + } + return x + y; +} + +void foo() { + int s, i = 0; + while (i++ < 4000 * 4000) + if (i % 91) s = bar(i, s); else s += 30; + printf("sum is %d\n", s); +} + +int main() { + foo(); + return 0; +} diff --git a/llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test b/llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test @@ -0,0 +1,41 @@ +; RUN: llvm-profgen --perfscript=%S/Inputs/noinline-cs-pseudoprobe.perfscript --binary=%S/Inputs/noinline-cs-pseudoprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER + +; CHECK-UNWINDER: Binary(noinline-cs-pseudoprobe.perfbin)'s Range Counter: +; CHECK-UNWINDER-NEXT: main:2 +; CHECK-UNWINDER-NEXT: (79e, 7bf): 15 +; CHECK-UNWINDER-NEXT: (7c4, 7cf): 15 +; CHECK-UNWINDER-NEXT: main:2 @ foo:8 +; CHECK-UNWINDER-NEXT: (760, 77f): 15 + +; CHECK-UNWINDER: Binary(noinline-cs-pseudoprobe.perfbin)'s Branch Counter: +; CHECK-UNWINDER-NEXT: main:2 +; CHECK-UNWINDER-NEXT: (7bf, 760): 15 +; CHECK-UNWINDER-NEXT: (7cf, 79e): 16 +; CHECK-UNWINDER-NEXT: main:2 @ foo:8 +; CHECK-UNWINDER-NEXT: (77f, 7c4): 17 + + +; clang -O3 -fexperimental-new-pass-manager -fuse-ld=lld -fpseudo-probe-for-profiling +; -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -Xclang -mdisable-tail-calls +; -fno-inline-functions -g test.c -o a.out + +#include + +int bar(int x, int y) { + if (x % 3) { + return x - y; + } + return x + y; +} + +void foo() { + int s, i = 0; + while (i++ < 4000 * 4000) + if (i % 91) s = bar(i, s); else s += 30; + printf("sum is %d\n", s); +} + +int main() { + foo(); + return 0; +} diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h --- a/llvm/tools/llvm-profgen/PerfReader.h +++ b/llvm/tools/llvm-profgen/PerfReader.h @@ -253,7 +253,7 @@ }; // Utilities for LLVM-style RTTI - enum ContextKind { CK_StringBased }; + enum ContextKind { CK_StringBased, CK_ProbeBased }; const ContextKind Kind; ContextKind getKind() const { return Kind; } ContextKey(ContextKind K) : Kind(K){}; @@ -275,6 +275,37 @@ void genHashCode() { HashCode = hash_value(Context); } }; +// Probe based context key as the intermediate key of context +// String based context key will introduce redundant string handling +// since the callee context is inferred from the context string which +// need to be splitted by '@' to get the last location frame, so we +// can just use probe instead and generate the string in the end. +struct ProbeBasedCtxKey : public ContextKey { + SmallVector Probes; + + ProbeBasedCtxKey() : ContextKey(CK_ProbeBased) {} + static bool classof(const ContextKey *K) { + return K->getKind() == CK_ProbeBased; + } + + bool isEqual(const ContextKey *K) const override { + const ProbeBasedCtxKey *O = dyn_cast(K); + assert(O != nullptr && "Probe based key shouldn't be null in isEqual"); + return std::equal(Probes.begin(), Probes.end(), O->Probes.begin(), + O->Probes.end()); + } + + void genHashCode() { + for (const auto *P : Probes) { + HashCode = hash_combine(HashCode, P); + } + if (HashCode == 0) { + // Avoid zero value of HashCode when it's an empty list + HashCode = 1; + } + } +}; + // The counter of branch samples for one function indexed by the branch, // which is represented as the source and target offset pair. using BranchSample = std::map, uint64_t>; @@ -343,8 +374,21 @@ uint64_t Repeat); void recordBranchCount(const LBREntry &Branch, UnwindState &State, uint64_t Repeat); - SampleCounter &getOrCreateSampleCounter(const ProfiledBinary *Binary, - std::list &CallStack); + SampleCounter &getOrCreateCounter(const ProfiledBinary *Binary, + std::list &CallStack); + // Use pseudo probe based context key to get the sample counter + // A context stands for a call path from 'main' to an uninlined + // callee with all inline frames recovered on that path. The probes + // belonging to that call path is the probes either originated from + // the callee or from any functions inlined into the callee. Since + // pseudo probes are organized in a tri-tree style after decoded, + // the tree path from the tri-tree root (which is the uninlined + // callee) to the probe node forms an inline context. + // Here we use a list of probe(pointer) as the context key to speed up + // aggregation and the final context string will be generate in + // ProfileGenerator + SampleCounter &getOrCreateCounterForProbe(const ProfiledBinary *Binary, + std::list &CallStack); private: ContextSampleCounterMap *CtxCounterMap; diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp --- a/llvm/tools/llvm-profgen/PerfReader.cpp +++ b/llvm/tools/llvm-profgen/PerfReader.cpp @@ -40,18 +40,27 @@ InstructionPointer &IP = State.InstPtr; uint64_t Target = State.getCurrentLBRTarget(); uint64_t End = IP.Address; - // Unwind linear execution part - while (IP.Address >= Target) { - uint64_t PrevIP = IP.Address; - IP.backward(); - // Break into segments for implicit call/return due to inlining - bool SameInlinee = - State.getBinary()->inlineContextEqual(PrevIP, IP.Address); - if (!SameInlinee || PrevIP == Target) { - recordRangeCount(PrevIP, End, State, Repeat); - End = IP.Address; + if (State.getBinary()->usePseudoProbes()) { + // The outcome of the virtual unwinding with pseudo probes is a + // map from a context key to the address range being unwound. + // This means basically linear unwinding is not needed for pseudo + // probes. The range will be simply recorded here and will be + // converted to a list of pseudo probes to report in ProfileGenerator. + recordRangeCount(Target, End, State, Repeat); + } else { + // Unwind linear execution part + while (IP.Address >= Target) { + uint64_t PrevIP = IP.Address; + IP.backward(); + // Break into segments for implicit call/return due to inlining + bool SameInlinee = + State.getBinary()->inlineContextEqual(PrevIP, IP.Address); + if (!SameInlinee || PrevIP == Target) { + recordRangeCount(PrevIP, End, State, Repeat); + End = IP.Address; + } + State.CallStack.front() = IP.Address; } - State.CallStack.front() = IP.Address; } } @@ -73,8 +82,11 @@ } SampleCounter & -VirtualUnwinder::getOrCreateSampleCounter(const ProfiledBinary *Binary, - std::list &CallStack) { +VirtualUnwinder::getOrCreateCounter(const ProfiledBinary *Binary, + std::list &CallStack) { + if (Binary->usePseudoProbes()) { + return getOrCreateCounterForProbe(Binary, CallStack); + } std::shared_ptr KeyStr = std::make_shared(); KeyStr->Context = Binary->getExpandedContextStr(CallStack); @@ -84,12 +96,46 @@ return Ret.first->second; } +SampleCounter & +VirtualUnwinder::getOrCreateCounterForProbe(const ProfiledBinary *Binary, + std::list &CallStack) { + std::shared_ptr ProbeBasedKey = + std::make_shared(); + if (CallStack.size() > 1) { + // We don't need to top frame probe since it should be extracted + // from the range. + // The top of stack is an instruction from the function where + // the LBR address range physcially resides. Strip it since + // the function is not a part of the call context. We also + // don't need its inline context since the probes being unwound + // come with an inline context all the way back to the uninlined + // function in their prefix tree. + auto Iter = CallStack.rbegin(); + auto EndT = std::prev(CallStack.rend()); + for (; Iter != EndT; Iter++) { + uint64_t Address = *Iter; + const PseudoProbe *CallProbe = Binary->getCallProbeForAddr(Address); + // We may not find a probe for a merged or external callsite. + // Callsite merging may cause the loss of original probe IDs. + // Cutting off the context from here since the inline will + // not know how to consume a context with unknown callsites. + if (!CallProbe) + break; + ProbeBasedKey->Probes.emplace_back(CallProbe); + } + } + ProbeBasedKey->genHashCode(); + Hashable ContextId(ProbeBasedKey); + auto Ret = CtxCounterMap->emplace(ContextId, SampleCounter()); + return Ret.first->second; +} + void VirtualUnwinder::recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State, uint64_t Repeat) { uint64_t StartOffset = State.getBinary()->virtualAddrToOffset(Start); uint64_t EndOffset = State.getBinary()->virtualAddrToOffset(End); SampleCounter &SCounter = - getOrCreateSampleCounter(State.getBinary(), State.CallStack); + getOrCreateCounter(State.getBinary(), State.CallStack); SCounter.recordRangeCount(StartOffset, EndOffset, Repeat); } @@ -100,7 +146,7 @@ uint64_t SourceOffset = State.getBinary()->virtualAddrToOffset(Branch.Source); uint64_t TargetOffset = State.getBinary()->virtualAddrToOffset(Branch.Target); SampleCounter &SCounter = - getOrCreateSampleCounter(State.getBinary(), State.CallStack); + getOrCreateCounter(State.getBinary(), State.CallStack); SCounter.recordBranchCount(SourceOffset, TargetOffset, Repeat); } @@ -212,7 +258,7 @@ } // Use ordered map to make the output deterministic -using OrderedCounterForPrint = std::map; +using OrderedCounterForPrint = std::map; static void printSampleCounter(OrderedCounterForPrint &OrderedCounter) { for (auto Range : OrderedCounter) { @@ -224,22 +270,41 @@ } } -static void printRangeCounter(ContextSampleCounterMap &Counter) { +static std::string getContextKeyStr(ContextKey *K, + const ProfiledBinary *Binary) { + std::string ContextStr; + if (const auto *CtxKey = dyn_cast(K)) { + return CtxKey->Context; + } else if (const auto *CtxKey = dyn_cast(K)) { + SmallVector ContextStack; + for (const auto *Probe : CtxKey->Probes) { + Binary->getInlineContextForProbe(Probe, ContextStack, true); + } + for (const auto &Context : ContextStack) { + if (ContextStr.size()) + ContextStr += " @ "; + ContextStr += Context; + } + } + return ContextStr; +} + +static void printRangeCounter(ContextSampleCounterMap &Counter, + const ProfiledBinary *Binary) { OrderedCounterForPrint OrderedCounter; for (auto &CI : Counter) { - const StringBasedCtxKey *CtxKey = - dyn_cast(CI.first.getPtr()); - OrderedCounter[CtxKey->Context] = CI.second.RangeCounter; + OrderedCounter[getContextKeyStr(CI.first.getPtr(), Binary)] = + CI.second.RangeCounter; } printSampleCounter(OrderedCounter); } -static void printBranchCounter(ContextSampleCounterMap &Counter) { +static void printBranchCounter(ContextSampleCounterMap &Counter, + const ProfiledBinary *Binary) { OrderedCounterForPrint OrderedCounter; for (auto &CI : Counter) { - const StringBasedCtxKey *CtxKey = - dyn_cast(CI.first.getPtr()); - OrderedCounter[CtxKey->Context] = CI.second.BranchCounter; + OrderedCounter[getContextKeyStr(CI.first.getPtr(), Binary)] = + CI.second.BranchCounter; } printSampleCounter(OrderedCounter); } @@ -248,9 +313,9 @@ for (auto I : BinarySampleCounters) { const ProfiledBinary *Binary = I.first; outs() << "Binary(" << Binary->getName().str() << ")'s Range Counter:\n"; - printRangeCounter(I.second); + printRangeCounter(I.second, Binary); outs() << "\nBinary(" << Binary->getName().str() << ")'s Branch Counter:\n"; - printBranchCounter(I.second); + printBranchCounter(I.second, Binary); } } diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h --- a/llvm/tools/llvm-profgen/ProfileGenerator.h +++ b/llvm/tools/llvm-profgen/ProfileGenerator.h @@ -56,6 +56,7 @@ }; class CSProfileGenerator : public ProfileGenerator { +protected: const BinarySampleCounterMap &BinarySampleCounters; public: @@ -107,6 +108,16 @@ void populateInferredFunctionSamples(); }; +class PseudoProbeCSProfileGenerator : public CSProfileGenerator { + +public: + PseudoProbeCSProfileGenerator(const BinarySampleCounterMap &Counters) + : CSProfileGenerator(Counters) {} + void generateProfile() override { + // TODO + } +}; + } // end namespace sampleprof } // end namespace llvm diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp --- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp +++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp @@ -28,13 +28,23 @@ namespace llvm { namespace sampleprof { +static bool +usePseudoProbes(const BinarySampleCounterMap &BinarySampleCounters) { + return BinarySampleCounters.size() && + BinarySampleCounters.begin()->first->usePseudoProbes(); +} + std::unique_ptr ProfileGenerator::create(const BinarySampleCounterMap &BinarySampleCounters, enum PerfScriptType SampleType) { std::unique_ptr ProfileGenerator; - if (SampleType == PERF_LBR_STACK) { - ProfileGenerator.reset(new CSProfileGenerator(BinarySampleCounters)); + if (usePseudoProbes(BinarySampleCounters)) { + ProfileGenerator.reset( + new PseudoProbeCSProfileGenerator(BinarySampleCounters)); + } else { + ProfileGenerator.reset(new CSProfileGenerator(BinarySampleCounters)); + } } else { // TODO: llvm_unreachable("Unsupported perfscript!"); diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h --- a/llvm/tools/llvm-profgen/ProfiledBinary.h +++ b/llvm/tools/llvm-profgen/ProfiledBinary.h @@ -133,6 +133,8 @@ // Pseudo probe decoder PseudoProbeDecoder ProbeDecoder; + bool UsePseudoProbes = false; + void setPreferredBaseAddress(const ELFObjectFileBase *O); void decodePseudoProbe(const ELFObjectFileBase *Obj); @@ -204,6 +206,7 @@ return offsetToVirtualAddr(CodeAddrs[Index]); } + bool usePseudoProbes() const { return UsePseudoProbes; } // Get the index in CodeAddrs for the address // As we might get an address which is not the code // here it would round to the next valid code address by @@ -234,6 +237,17 @@ // It will search the disassembling info stored in Offset2LocStackMap. This is // used as the key of function sample map std::string getExpandedContextStr(const std::list &stack) const; + + const PseudoProbe *getCallProbeForAddr(uint64_t Address) const { + return ProbeDecoder.getCallProbeForAddr(Address); + } + void + getInlineContextForProbe(const PseudoProbe *Probe, + SmallVector &InlineContextStack, + bool IncludeLeaf) const { + return ProbeDecoder.getInlineContextForProbe(Probe, InlineContextStack, + IncludeLeaf); + } }; } // end namespace sampleprof diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp --- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp +++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp @@ -156,7 +156,6 @@ OContextStr << ContextVec[I]; } } - return OContextStr.str(); } @@ -187,6 +186,8 @@ StringRef Contents = unwrapOrError(Section.getContents(), FileName); ProbeDecoder.buildAddress2ProbeMap( reinterpret_cast(Contents.data()), Contents.size()); + // set UsePseudoProbes flag, used for PerfReader + UsePseudoProbes = true; } } diff --git a/llvm/tools/llvm-profgen/PseudoProbe.h b/llvm/tools/llvm-profgen/PseudoProbe.h --- a/llvm/tools/llvm-profgen/PseudoProbe.h +++ b/llvm/tools/llvm-profgen/PseudoProbe.h @@ -201,6 +201,21 @@ // Print pseudo_probe section info, used along with show-disassembly void printProbeForAddress(raw_ostream &OS, uint64_t Address); + + // Look up the probe of a call for the input address + const PseudoProbe *getCallProbeForAddr(uint64_t Address) const; + + // Helper function to populate one probe's inline stack into + // \p InlineContextStack. + // Current leaf location info will be added if IncludeLeaf is true + // Example: + // Current probe(bar:3) inlined at foo:2 then inlined at main:1 + // IncludeLeaf = true, Output: [main:1, foo:2, bar:3] + // IncludeLeaf = false, OUtput: [main:1, foo:2] + void + getInlineContextForProbe(const PseudoProbe *Probe, + SmallVector &InlineContextStack, + bool IncludeLeaf) const; }; } // end namespace sampleprof diff --git a/llvm/tools/llvm-profgen/PseudoProbe.cpp b/llvm/tools/llvm-profgen/PseudoProbe.cpp --- a/llvm/tools/llvm-profgen/PseudoProbe.cpp +++ b/llvm/tools/llvm-profgen/PseudoProbe.cpp @@ -293,5 +293,42 @@ } } +const PseudoProbe * +PseudoProbeDecoder::getCallProbeForAddr(uint64_t Address) const { + auto It = Address2ProbesMap.find(Address); + if (It == Address2ProbesMap.end()) + return nullptr; + const std::vector &Probes = It->second; + + const PseudoProbe *CallProbe = nullptr; + for (const auto &Probe : Probes) { + if (Probe.isCall()) { + assert(!CallProbe && + "There should be only one call probe corresponding to address " + "which is a callsite."); + CallProbe = &Probe; + } + } + return CallProbe; +} + +void PseudoProbeDecoder::getInlineContextForProbe( + const PseudoProbe *Probe, SmallVector &InlineContextStack, + bool IncludeLeaf) const { + if (IncludeLeaf) { + // Note that the context from probe doesn't include leaf frame, + // hence we need to retrieve and prepend leaf if requested. + auto It = GUID2FuncDescMap.find(Probe->GUID); + assert(It != GUID2FuncDescMap.end() && + "Should have function descriptor for a valid GUID"); + StringRef FuncName = It->second.FuncName; + // InlineContextStack is in callee-caller order, so push leaf in the front + InlineContextStack.emplace_back(FuncName.str() + ":" + + Twine(Probe->Index).str()); + } + + Probe->getInlineContext(InlineContextStack, GUID2FuncDescMap, true); +} + } // end namespace sampleprof } // end namespace llvm