diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h --- a/llvm/tools/llvm-profgen/PerfReader.h +++ b/llvm/tools/llvm-profgen/PerfReader.h @@ -133,7 +133,7 @@ // Profiled binary that current frame address belongs to ProfiledBinary *Binary; // Call stack recorded in FILO(leaf to root) order - std::list CallStack; + SmallVector CallStack; // LBR stack recorded in FIFO order SmallVector LBRStack; @@ -147,7 +147,7 @@ const HybridSample *Other = dyn_cast(K); if (Other->Binary != Binary) return false; - const std::list &OtherCallStack = Other->CallStack; + const SmallVector &OtherCallStack = Other->CallStack; const SmallVector &OtherLBRStack = Other->LBRStack; if (CallStack.size() != OtherCallStack.size() || @@ -193,14 +193,38 @@ std::unordered_map, uint64_t, Hashable::Hash, Hashable::Equal>; +using SampleVector = SmallVector, 16>; // The state for the unwinder, it doesn't hold the data but only keep the // pointer/index of the data, While unwinding, the CallStack is changed // dynamicially and will be recorded as the context of the sample struct UnwindState { // Profiled binary that current frame address belongs to const ProfiledBinary *Binary; - // TODO: switch to use trie for call stack - std::list CallStack; + // Call stack trie node + struct Frame { + const uint64_t Address = 0; + Frame *Parent; + SampleVector RangeSamples; + SampleVector BranchSamples; + std::unordered_map> Children; + + Frame() {} + Frame(uint64_t Addr) : Address(Addr) {} + Frame *getOrCreateFrame(uint64_t Address) { + auto Ret = Children.emplace(Address, std::make_unique(Address)); + Ret.first->second->Parent = this; + return Ret.first->second.get(); + } + void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Count) { + RangeSamples.emplace_back(std::make_tuple(Start, End, Count)); + } + void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Count) { + BranchSamples.emplace_back(std::make_tuple(Source, Target, Count)); + } + }; + + Frame FrameTrieRoot; + Frame *StackLeaf; // Used to fall through the LBR stack uint32_t LBRIndex = 0; // Reference to HybridSample.LBRStack @@ -208,19 +232,20 @@ // Used to iterate the address range InstructionPointer InstPtr; UnwindState(const HybridSample *Sample) - : Binary(Sample->Binary), CallStack(Sample->CallStack), - LBRStack(Sample->LBRStack), - InstPtr(Sample->Binary, Sample->CallStack.front()) {} + : Binary(Sample->Binary), LBRStack(Sample->LBRStack), + InstPtr(Sample->Binary, Sample->CallStack.front()) { + initCallStackTrie(Sample->CallStack); + } bool validateInitialState() { uint64_t LBRLeaf = LBRStack[LBRIndex].Target; - uint64_t StackLeaf = CallStack.front(); + uint64_t LeafAddr = StackLeaf->Address; // When we take a stack sample, ideally the sampling distance between the // leaf IP of stack and the last LBR target shouldn't be very large. // Use a heuristic size (0x100) to filter out broken records. - if (StackLeaf < LBRLeaf || StackLeaf >= LBRLeaf + 0x100) { + if (LeafAddr < LBRLeaf || LeafAddr >= LBRLeaf + 0x100) { WithColor::warning() << "Bogus trace: stack tip = " - << format("%#010x", StackLeaf) + << format("%#010x", LeafAddr) << ", LBR tip = " << format("%#010x\n", LBRLeaf); return false; } @@ -228,19 +253,36 @@ } void checkStateConsistency() { - assert(InstPtr.Address == CallStack.front() && + assert(InstPtr.Address == StackLeaf->Address && "IP should align with context leaf"); } - std::string getExpandedContextStr() const { - return Binary->getExpandedContextStr(CallStack); - } const ProfiledBinary *getBinary() const { return Binary; } bool hasNextLBR() const { return LBRIndex < LBRStack.size(); } uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; } uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; } const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; } void advanceLBR() { LBRIndex++; } + + void appendFrame(uint64_t Address) { + StackLeaf = StackLeaf->getOrCreateFrame(Address); + } + + void switchToLeaf(uint64_t Address) { + if (StackLeaf->Address == Address) + return; + StackLeaf = StackLeaf->Parent->getOrCreateFrame(Address); + } + + void popStackLeaf() { StackLeaf = StackLeaf->Parent; } + + void initCallStackTrie(const SmallVectorImpl &CallStack) { + Frame *Cur = &FrameTrieRoot; + for (auto Address : reverse(CallStack)) { + Cur = Cur->getOrCreateFrame(Address); + } + StackLeaf = Cur; + } }; // Base class for sample counter key with context @@ -350,31 +392,36 @@ */ class VirtualUnwinder { public: - VirtualUnwinder(ContextSampleCounterMap *Counter) : CtxCounterMap(Counter) {} + VirtualUnwinder(ContextSampleCounterMap *Counter, const ProfiledBinary *B) + : CtxCounterMap(Counter), Binary(B) {} bool isCallState(UnwindState &State) const { // The tail call frame is always missing here in stack sample, we will // use a specific tail call tracker to infer it. - return State.getBinary()->addressIsCall(State.getCurrentLBRSource()); + return Binary->addressIsCall(State.getCurrentLBRSource()); } bool isReturnState(UnwindState &State) const { // Simply check addressIsReturn, as ret is always reliable, both for // regular call and tail call. - return State.getBinary()->addressIsReturn(State.getCurrentLBRSource()); + return Binary->addressIsReturn(State.getCurrentLBRSource()); } void unwindCall(UnwindState &State); void unwindLinear(UnwindState &State, uint64_t Repeat); void unwindReturn(UnwindState &State); void unwindBranchWithinFrame(UnwindState &State); + void recordSampleWithinFrame(UnwindState::Frame *Cur, + SmallVectorImpl &CallStack); + // Use DFS traversal to record each samples on trie node + void recordSampleOnTrie(UnwindState::Frame *Cur, + SmallVectorImpl &CallStack); bool unwind(const HybridSample *Sample, uint64_t Repeat); void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State, uint64_t Repeat); void recordBranchCount(const LBREntry &Branch, UnwindState &State, uint64_t Repeat); - SampleCounter &getOrCreateCounter(const ProfiledBinary *Binary, - std::list &CallStack); + SampleCounter &getOrCreateCounter(SmallVectorImpl &CallStack); // Use pseudo probe based context key to get the sample counter // A context stands for a call path from 'main' to an uninlined // callee with all inline frames recovered on that path. The probes @@ -386,11 +433,13 @@ // Here we use a list of probe(pointer) as the context key to speed up // aggregation and the final context string will be generate in // ProfileGenerator - SampleCounter &getOrCreateCounterForProbe(const ProfiledBinary *Binary, - std::list &CallStack); + SampleCounter & + getOrCreateCounterForProbe(SmallVectorImpl &CallStack); private: ContextSampleCounterMap *CtxCounterMap; + // Profiled binary that current frame address belongs to + const ProfiledBinary *Binary; }; // Filename to binary map @@ -456,10 +505,11 @@ // Parse the hybrid sample including the call and LBR line void parseHybridSample(TraceStream &TraceIt); // Extract call stack from the perf trace lines - bool extractCallstack(TraceStream &TraceIt, std::list &CallStack); + bool extractCallstack(TraceStream &TraceIt, + SmallVectorImpl &CallStack); // Extract LBR stack from one perf trace line bool extractLBRStack(TraceStream &TraceIt, - SmallVector &LBRStack, + SmallVectorImpl &LBRStack, ProfiledBinary *Binary); void checkAndSetPerfType(cl::list &PerfTraceFilenames); // Post process the profile after trace aggregation, we will do simple range diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp --- a/llvm/tools/llvm-profgen/PerfReader.cpp +++ b/llvm/tools/llvm-profgen/PerfReader.cpp @@ -28,11 +28,11 @@ // 2nd frame is in prolog/epilog. In the future, we will switch to // pro/epi tracker(Dwarf CFI) for the precise check. uint64_t Source = State.getCurrentLBRSource(); - auto Iter = State.CallStack.begin(); - if (State.CallStack.size() == 1 || *(++Iter) != Source) { - State.CallStack.front() = Source; + auto PrevFrame = State.StackLeaf->Parent; + if (PrevFrame == &State.FrameTrieRoot || PrevFrame->Address != Source) { + State.switchToLeaf(Source); } else { - State.CallStack.pop_front(); + State.popStackLeaf(); } State.InstPtr.update(Source); } @@ -41,26 +41,27 @@ InstructionPointer &IP = State.InstPtr; uint64_t Target = State.getCurrentLBRTarget(); uint64_t End = IP.Address; - if (State.getBinary()->usePseudoProbes()) { + if (Binary->usePseudoProbes()) { // The outcome of the virtual unwinding with pseudo probes is a // map from a context key to the address range being unwound. // This means basically linear unwinding is not needed for pseudo // probes. The range will be simply recorded here and will be // converted to a list of pseudo probes to report in ProfileGenerator. - recordRangeCount(Target, End, State, Repeat); + State.StackLeaf->Parent->recordRangeCount(Target, End, Repeat); } else { // Unwind linear execution part + uint64_t LeafAddr = State.StackLeaf->Address; while (IP.Address >= Target) { uint64_t PrevIP = IP.Address; IP.backward(); // Break into segments for implicit call/return due to inlining - bool SameInlinee = - State.getBinary()->inlineContextEqual(PrevIP, IP.Address); + bool SameInlinee = Binary->inlineContextEqual(PrevIP, IP.Address); if (!SameInlinee || PrevIP == Target) { - recordRangeCount(PrevIP, End, State, Repeat); + State.switchToLeaf(LeafAddr); + State.StackLeaf->recordRangeCount(PrevIP, End, Repeat); End = IP.Address; } - State.CallStack.front() = IP.Address; + LeafAddr = IP.Address; } } } @@ -68,9 +69,9 @@ void VirtualUnwinder::unwindReturn(UnwindState &State) { // Add extra frame as we unwind through the return const LBREntry &LBR = State.getCurrentLBR(); - uint64_t CallAddr = State.getBinary()->getCallAddrFromFrameAddr(LBR.Target); - State.CallStack.front() = CallAddr; - State.CallStack.push_front(LBR.Source); + uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target); + State.switchToLeaf(CallAddr); + State.appendFrame(LBR.Source); State.InstPtr.update(LBR.Source); } @@ -78,15 +79,67 @@ // TODO: Tolerate tail call for now, as we may see tail call from libraries. // This is only for intra function branches, excluding tail calls. uint64_t Source = State.getCurrentLBRSource(); - State.CallStack.front() = Source; + State.switchToLeaf(Source); State.InstPtr.update(Source); } +void VirtualUnwinder::recordSampleWithinFrame( + UnwindState::Frame *Cur, SmallVectorImpl &CallStack) { + if (Cur->RangeSamples.empty() && Cur->BranchSamples.empty()) + return; + SampleCounter &SCounter = getOrCreateCounter(CallStack); + for (auto &Item : Cur->RangeSamples) { + uint64_t StartOffset = Binary->virtualAddrToOffset(std::get<0>(Item)); + uint64_t EndOffset = Binary->virtualAddrToOffset(std::get<1>(Item)); + SCounter.recordRangeCount(StartOffset, EndOffset, std::get<2>(Item)); + } + + for (auto &Item : Cur->BranchSamples) { + uint64_t SourceOffset = Binary->virtualAddrToOffset(std::get<0>(Item)); + uint64_t TargetOffset = Binary->virtualAddrToOffset(std::get<1>(Item)); + SCounter.recordBranchCount(SourceOffset, TargetOffset, std::get<2>(Item)); + } +} + +void VirtualUnwinder::recordSampleOnTrie(UnwindState::Frame *Cur, + SmallVectorImpl &CallStack) { + if (Cur->Address != 0) { + if (Binary->usePseudoProbes()) { + const PseudoProbe *CallProbe = Binary->getCallProbeForAddr(Cur->Address); + // We may not find a probe for a merged or external callsite. + // Callsite merging may cause the loss of original probe IDs. + // Cutting off the context from here since the inline will + // not know how to consume a context with unknown callsites. + if (!CallProbe) { + for (const auto &Item : Cur->Children) { + // Start a new traversal ignoring its bottom context + SmallVector NewCallStack; + recordSampleOnTrie(Item.second.get(), NewCallStack); + } + return; + } else { + CallStack.push_back(reinterpret_cast(CallProbe)); + } + } else { + CallStack.push_back(Cur->Address); + } + } + + recordSampleWithinFrame(Cur, CallStack); + + for (const auto &Item : Cur->Children) { + recordSampleOnTrie(Item.second.get(), CallStack); + } + + // Recover the call stack + if (!CallStack.empty()) + CallStack.pop_back(); +} + SampleCounter & -VirtualUnwinder::getOrCreateCounter(const ProfiledBinary *Binary, - std::list &CallStack) { +VirtualUnwinder::getOrCreateCounter(SmallVectorImpl &CallStack) { if (Binary->usePseudoProbes()) { - return getOrCreateCounterForProbe(Binary, CallStack); + return getOrCreateCounterForProbe(CallStack); } std::shared_ptr KeyStr = std::make_shared(); @@ -97,33 +150,13 @@ return Ret.first->second; } -SampleCounter & -VirtualUnwinder::getOrCreateCounterForProbe(const ProfiledBinary *Binary, - std::list &CallStack) { +SampleCounter &VirtualUnwinder::getOrCreateCounterForProbe( + SmallVectorImpl &CallStack) { std::shared_ptr ProbeBasedKey = std::make_shared(); - if (CallStack.size() > 1) { - // We don't need to top frame probe since it should be extracted - // from the range. - // The top of stack is an instruction from the function where - // the LBR address range physcially resides. Strip it since - // the function is not a part of the call context. We also - // don't need its inline context since the probes being unwound - // come with an inline context all the way back to the uninlined - // function in their prefix tree. - auto Iter = CallStack.rbegin(); - auto EndT = std::prev(CallStack.rend()); - for (; Iter != EndT; Iter++) { - uint64_t Address = *Iter; - const PseudoProbe *CallProbe = Binary->getCallProbeForAddr(Address); - // We may not find a probe for a merged or external callsite. - // Callsite merging may cause the loss of original probe IDs. - // Cutting off the context from here since the inline will - // not know how to consume a context with unknown callsites. - if (!CallProbe) - break; - ProbeBasedKey->Probes.emplace_back(CallProbe); - } + for (auto CallProbe : CallStack) { + ProbeBasedKey->Probes.emplace_back( + reinterpret_cast(CallProbe)); } ProfileGenerator::compressRecursionContext( ProbeBasedKey->Probes); @@ -133,24 +166,17 @@ return Ret.first->second; } -void VirtualUnwinder::recordRangeCount(uint64_t Start, uint64_t End, - UnwindState &State, uint64_t Repeat) { - uint64_t StartOffset = State.getBinary()->virtualAddrToOffset(Start); - uint64_t EndOffset = State.getBinary()->virtualAddrToOffset(End); - SampleCounter &SCounter = - getOrCreateCounter(State.getBinary(), State.CallStack); - SCounter.recordRangeCount(StartOffset, EndOffset, Repeat); -} - void VirtualUnwinder::recordBranchCount(const LBREntry &Branch, UnwindState &State, uint64_t Repeat) { if (Branch.IsArtificial) return; - uint64_t SourceOffset = State.getBinary()->virtualAddrToOffset(Branch.Source); - uint64_t TargetOffset = State.getBinary()->virtualAddrToOffset(Branch.Target); - SampleCounter &SCounter = - getOrCreateCounter(State.getBinary(), State.CallStack); - SCounter.recordBranchCount(SourceOffset, TargetOffset, Repeat); + + if (Binary->usePseudoProbes()) { + State.StackLeaf->Parent->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + } else { + State.StackLeaf->recordBranchCount(Branch.Source, Branch.Target, Repeat); + } } bool VirtualUnwinder::unwind(const HybridSample *Sample, uint64_t Repeat) { @@ -199,6 +225,9 @@ // Record `branch` with calling context after unwinding. recordBranchCount(Branch, State, Repeat); } + // As samples are aggregated on trie, record them into counter map + SmallVector Context; + recordSampleOnTrie(&State.FrameTrieRoot, Context); return true; } @@ -325,7 +354,8 @@ void PerfReader::unwindSamples() { for (const auto &Item : AggregatedSamples) { const HybridSample *Sample = dyn_cast(Item.first.getPtr()); - VirtualUnwinder Unwinder(&BinarySampleCounters[Sample->Binary]); + VirtualUnwinder Unwinder(&BinarySampleCounters[Sample->Binary], + Sample->Binary); Unwinder.unwind(Sample, Item.second); } @@ -334,7 +364,7 @@ } bool PerfReader::extractLBRStack(TraceStream &TraceIt, - SmallVector &LBRStack, + SmallVectorImpl &LBRStack, ProfiledBinary *Binary) { // The raw format of LBR stack is like: // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... @@ -398,7 +428,7 @@ } bool PerfReader::extractCallstack(TraceStream &TraceIt, - std::list &CallStack) { + SmallVectorImpl &CallStack) { // The raw format of call stack is like: // 4005dc # leaf frame // 400634 diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h --- a/llvm/tools/llvm-profgen/ProfiledBinary.h +++ b/llvm/tools/llvm-profgen/ProfiledBinary.h @@ -236,7 +236,8 @@ // Get the context string of the current stack with inline context filled in. // It will search the disassembling info stored in Offset2LocStackMap. This is // used as the key of function sample map - std::string getExpandedContextStr(const std::list &stack) const; + std::string + getExpandedContextStr(const SmallVectorImpl &stack) const; const PseudoProbe *getCallProbeForAddr(uint64_t Address) const { return ProbeDecoder.getCallProbeForAddr(Address); diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp --- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp +++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp @@ -128,13 +128,13 @@ Context2.begin(), Context2.begin() + Context2.size() - 1); } -std::string -ProfiledBinary::getExpandedContextStr(const std::list &Stack) const { +std::string ProfiledBinary::getExpandedContextStr( + const SmallVectorImpl &Stack) const { std::string ContextStr; SmallVector ContextVec; // Process from frame root to leaf - for (auto Iter = Stack.rbegin(); Iter != Stack.rend(); Iter++) { - uint64_t Offset = virtualAddrToOffset(*Iter); + for (auto Address : Stack) { + uint64_t Offset = virtualAddrToOffset(Address); const FrameLocationStack &ExpandedContext = getFrameLocationStack(Offset); for (const auto &Loc : ExpandedContext) { ContextVec.push_back(getCallSite(Loc));