diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h --- a/llvm/tools/llvm-profgen/PerfReader.h +++ b/llvm/tools/llvm-profgen/PerfReader.h @@ -133,7 +133,7 @@ // Profiled binary that current frame address belongs to ProfiledBinary *Binary; // Call stack recorded in FILO(leaf to root) order - std::list CallStack; + SmallVector CallStack; // LBR stack recorded in FIFO order SmallVector LBRStack; @@ -147,7 +147,7 @@ const HybridSample *Other = dyn_cast(K); if (Other->Binary != Binary) return false; - const std::list &OtherCallStack = Other->CallStack; + const SmallVector &OtherCallStack = Other->CallStack; const SmallVector &OtherLBRStack = Other->LBRStack; if (CallStack.size() != OtherCallStack.size() || @@ -193,14 +193,40 @@ std::unordered_map, uint64_t, Hashable::Hash, Hashable::Equal>; +using SampleVector = SmallVector, 16>; // The state for the unwinder, it doesn't hold the data but only keep the // pointer/index of the data, While unwinding, the CallStack is changed // dynamicially and will be recorded as the context of the sample struct UnwindState { // Profiled binary that current frame address belongs to const ProfiledBinary *Binary; - // TODO: switch to use trie for call stack - std::list CallStack; + // Call stack trie node + struct ProfiledFrame { + const uint64_t Address = 0; + ProfiledFrame *Parent; + SampleVector RangeSamples; + SampleVector BranchSamples; + std::unordered_map> Children; + + ProfiledFrame(uint64_t Addr = 0, ProfiledFrame *P = nullptr) + : Address(Addr), Parent(P) {} + ProfiledFrame *getOrCreateChildFrame(uint64_t Address) { + assert(Address && "Address can't be zero!"); + auto Ret = Children.emplace( + Address, std::make_unique(Address, this)); + return Ret.first->second.get(); + } + void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Count) { + RangeSamples.emplace_back(std::make_tuple(Start, End, Count)); + } + void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Count) { + BranchSamples.emplace_back(std::make_tuple(Source, Target, Count)); + } + bool isDummyRoot() { return Address == 0; } + }; + + ProfiledFrame DummyTrieRoot; + ProfiledFrame *FrameCurrentLeaf; // Used to fall through the LBR stack uint32_t LBRIndex = 0; // Reference to HybridSample.LBRStack @@ -208,19 +234,20 @@ // Used to iterate the address range InstructionPointer InstPtr; UnwindState(const HybridSample *Sample) - : Binary(Sample->Binary), CallStack(Sample->CallStack), - LBRStack(Sample->LBRStack), - InstPtr(Sample->Binary, Sample->CallStack.front()) {} + : Binary(Sample->Binary), LBRStack(Sample->LBRStack), + InstPtr(Sample->Binary, Sample->CallStack.front()) { + initFrameTrie(Sample->CallStack); + } bool validateInitialState() { uint64_t LBRLeaf = LBRStack[LBRIndex].Target; - uint64_t StackLeaf = CallStack.front(); + uint64_t LeafAddr = FrameCurrentLeaf->Address; // When we take a stack sample, ideally the sampling distance between the // leaf IP of stack and the last LBR target shouldn't be very large. // Use a heuristic size (0x100) to filter out broken records. - if (StackLeaf < LBRLeaf || StackLeaf >= LBRLeaf + 0x100) { + if (LeafAddr < LBRLeaf || LeafAddr >= LBRLeaf + 0x100) { WithColor::warning() << "Bogus trace: stack tip = " - << format("%#010x", StackLeaf) + << format("%#010x", LeafAddr) << ", LBR tip = " << format("%#010x\n", LBRLeaf); return false; } @@ -228,19 +255,40 @@ } void checkStateConsistency() { - assert(InstPtr.Address == CallStack.front() && + assert(InstPtr.Address == FrameCurrentLeaf->Address && "IP should align with context leaf"); } - std::string getExpandedContextStr() const { - return Binary->getExpandedContextStr(CallStack); - } const ProfiledBinary *getBinary() const { return Binary; } bool hasNextLBR() const { return LBRIndex < LBRStack.size(); } uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; } uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; } const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; } void advanceLBR() { LBRIndex++; } + + ProfiledFrame *getParentFrame() { return FrameCurrentLeaf->Parent; } + + void pushFrame(uint64_t Address) { + FrameCurrentLeaf = FrameCurrentLeaf->getOrCreateChildFrame(Address); + } + + void switchToFrame(uint64_t Address) { + if (FrameCurrentLeaf->Address == Address) + return; + FrameCurrentLeaf = FrameCurrentLeaf->Parent->getOrCreateChildFrame(Address); + } + + void popFrame() { FrameCurrentLeaf = FrameCurrentLeaf->Parent; } + + void initFrameTrie(const SmallVectorImpl &CallStack) { + ProfiledFrame *Cur = &DummyTrieRoot; + for (auto Address : reverse(CallStack)) { + Cur = Cur->getOrCreateChildFrame(Address); + } + FrameCurrentLeaf = Cur; + } + + ProfiledFrame *getDummyRootPtr() { return &DummyTrieRoot; } }; // Base class for sample counter key with context @@ -330,6 +378,56 @@ std::unordered_map, SampleCounter, Hashable::Hash, Hashable::Equal>; +struct FrameStack { + SmallVector Stack; + const ProfiledBinary *Binary; + FrameStack(const ProfiledBinary *B) : Binary(B) {} + bool pushFrame(UnwindState::ProfiledFrame *Cur) { + Stack.push_back(Cur->Address); + return true; + } + + void popFrame() { + if (!Stack.empty()) + Stack.pop_back(); + } + std::shared_ptr getContextKey(); +}; + +struct ProbeStack { + SmallVector Stack; + const ProfiledBinary *Binary; + ProbeStack(const ProfiledBinary *B) : Binary(B) {} + bool pushFrame(UnwindState::ProfiledFrame *Cur) { + const PseudoProbe *CallProbe = Binary->getCallProbeForAddr(Cur->Address); + // We may not find a probe for a merged or external callsite. + // Callsite merging may cause the loss of original probe IDs. + // Cutting off the context from here since the inliner will + // not know how to consume a context with unknown callsites. + if (!CallProbe) + return false; + Stack.push_back(CallProbe); + return true; + } + + void popFrame() { + if (!Stack.empty()) + Stack.pop_back(); + } + // Use pseudo probe based context key to get the sample counter + // A context stands for a call path from 'main' to an uninlined + // callee with all inline frames recovered on that path. The probes + // belonging to that call path is the probes either originated from + // the callee or from any functions inlined into the callee. Since + // pseudo probes are organized in a tri-tree style after decoded, + // the tree path from the tri-tree root (which is the uninlined + // callee) to the probe node forms an inline context. + // Here we use a list of probe(pointer) as the context key to speed up + // aggregation and the final context string will be generate in + // ProfileGenerator + std::shared_ptr getContextKey(); +}; + /* As in hybrid sample we have a group of LBRs and the most recent sampling call stack, we can walk through those LBRs to infer more call stacks which would be @@ -351,47 +449,42 @@ */ class VirtualUnwinder { public: - VirtualUnwinder(ContextSampleCounterMap *Counter) : CtxCounterMap(Counter) {} + VirtualUnwinder(ContextSampleCounterMap *Counter, const ProfiledBinary *B) + : CtxCounterMap(Counter), Binary(B) {} + bool unwind(const HybridSample *Sample, uint64_t Repeat); +private: bool isCallState(UnwindState &State) const { // The tail call frame is always missing here in stack sample, we will // use a specific tail call tracker to infer it. - return State.getBinary()->addressIsCall(State.getCurrentLBRSource()); + return Binary->addressIsCall(State.getCurrentLBRSource()); } bool isReturnState(UnwindState &State) const { // Simply check addressIsReturn, as ret is always reliable, both for // regular call and tail call. - return State.getBinary()->addressIsReturn(State.getCurrentLBRSource()); + return Binary->addressIsReturn(State.getCurrentLBRSource()); } void unwindCall(UnwindState &State); void unwindLinear(UnwindState &State, uint64_t Repeat); void unwindReturn(UnwindState &State); void unwindBranchWithinFrame(UnwindState &State); - bool unwind(const HybridSample *Sample, uint64_t Repeat); + template + void collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, T &Stack); + // Collect each samples on trie node by DFS traversal + template + void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur, T &Stack); + void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur); + void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State, uint64_t Repeat); void recordBranchCount(const LBREntry &Branch, UnwindState &State, uint64_t Repeat); - SampleCounter &getOrCreateCounter(const ProfiledBinary *Binary, - std::list &CallStack); - // Use pseudo probe based context key to get the sample counter - // A context stands for a call path from 'main' to an uninlined - // callee with all inline frames recovered on that path. The probes - // belonging to that call path is the probes either originated from - // the callee or from any functions inlined into the callee. Since - // pseudo probes are organized in a tri-tree style after decoded, - // the tree path from the tri-tree root (which is the uninlined - // callee) to the probe node forms an inline context. - // Here we use a list of probe(pointer) as the context key to speed up - // aggregation and the final context string will be generate in - // ProfileGenerator - SampleCounter &getOrCreateCounterForProbe(const ProfiledBinary *Binary, - std::list &CallStack); - private: ContextSampleCounterMap *CtxCounterMap; + // Profiled binary that current frame address belongs to + const ProfiledBinary *Binary; }; // Filename to binary map @@ -457,10 +550,11 @@ // Parse the hybrid sample including the call and LBR line void parseHybridSample(TraceStream &TraceIt); // Extract call stack from the perf trace lines - bool extractCallstack(TraceStream &TraceIt, std::list &CallStack); + bool extractCallstack(TraceStream &TraceIt, + SmallVectorImpl &CallStack); // Extract LBR stack from one perf trace line bool extractLBRStack(TraceStream &TraceIt, - SmallVector &LBRStack, + SmallVectorImpl &LBRStack, ProfiledBinary *Binary); void checkAndSetPerfType(cl::list &PerfTraceFilenames); // Post process the profile after trace aggregation, we will do simple range diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp --- a/llvm/tools/llvm-profgen/PerfReader.cpp +++ b/llvm/tools/llvm-profgen/PerfReader.cpp @@ -28,11 +28,11 @@ // 2nd frame is in prolog/epilog. In the future, we will switch to // pro/epi tracker(Dwarf CFI) for the precise check. uint64_t Source = State.getCurrentLBRSource(); - auto Iter = State.CallStack.begin(); - if (State.CallStack.size() == 1 || *(++Iter) != Source) { - State.CallStack.front() = Source; + auto *PrevFrame = State.getParentFrame(); + if (PrevFrame == State.getDummyRootPtr() || PrevFrame->Address != Source) { + State.switchToFrame(Source); } else { - State.CallStack.pop_front(); + State.popFrame(); } State.InstPtr.update(Source); } @@ -41,26 +41,29 @@ InstructionPointer &IP = State.InstPtr; uint64_t Target = State.getCurrentLBRTarget(); uint64_t End = IP.Address; - if (State.getBinary()->usePseudoProbes()) { + if (Binary->usePseudoProbes()) { + // We don't need to top frame probe since it should be extracted + // from the range. // The outcome of the virtual unwinding with pseudo probes is a // map from a context key to the address range being unwound. // This means basically linear unwinding is not needed for pseudo // probes. The range will be simply recorded here and will be // converted to a list of pseudo probes to report in ProfileGenerator. - recordRangeCount(Target, End, State, Repeat); + State.FrameCurrentLeaf->Parent->recordRangeCount(Target, End, Repeat); } else { // Unwind linear execution part + uint64_t LeafAddr = State.FrameCurrentLeaf->Address; while (IP.Address >= Target) { uint64_t PrevIP = IP.Address; IP.backward(); // Break into segments for implicit call/return due to inlining - bool SameInlinee = - State.getBinary()->inlineContextEqual(PrevIP, IP.Address); + bool SameInlinee = Binary->inlineContextEqual(PrevIP, IP.Address); if (!SameInlinee || PrevIP == Target) { - recordRangeCount(PrevIP, End, State, Repeat); + State.switchToFrame(LeafAddr); + State.FrameCurrentLeaf->recordRangeCount(PrevIP, End, Repeat); End = IP.Address; } - State.CallStack.front() = IP.Address; + LeafAddr = IP.Address; } } } @@ -68,9 +71,9 @@ void VirtualUnwinder::unwindReturn(UnwindState &State) { // Add extra frame as we unwind through the return const LBREntry &LBR = State.getCurrentLBR(); - uint64_t CallAddr = State.getBinary()->getCallAddrFromFrameAddr(LBR.Target); - State.CallStack.front() = CallAddr; - State.CallStack.push_front(LBR.Source); + uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target); + State.switchToFrame(CallAddr); + State.pushFrame(LBR.Source); State.InstPtr.update(LBR.Source); } @@ -78,79 +81,100 @@ // TODO: Tolerate tail call for now, as we may see tail call from libraries. // This is only for intra function branches, excluding tail calls. uint64_t Source = State.getCurrentLBRSource(); - State.CallStack.front() = Source; + State.switchToFrame(Source); State.InstPtr.update(Source); } -SampleCounter & -VirtualUnwinder::getOrCreateCounter(const ProfiledBinary *Binary, - std::list &CallStack) { - if (Binary->usePseudoProbes()) { - return getOrCreateCounterForProbe(Binary, CallStack); - } +std::shared_ptr FrameStack::getContextKey() { std::shared_ptr KeyStr = std::make_shared(); - KeyStr->Context = Binary->getExpandedContextStr(CallStack); + KeyStr->Context = Binary->getExpandedContextStr(Stack); KeyStr->genHashCode(); - auto Ret = - CtxCounterMap->emplace(Hashable(KeyStr), SampleCounter()); - return Ret.first->second; + return KeyStr; } -SampleCounter & -VirtualUnwinder::getOrCreateCounterForProbe(const ProfiledBinary *Binary, - std::list &CallStack) { +std::shared_ptr ProbeStack::getContextKey() { std::shared_ptr ProbeBasedKey = std::make_shared(); - if (CallStack.size() > 1) { - // We don't need to top frame probe since it should be extracted - // from the range. - // The top of stack is an instruction from the function where - // the LBR address range physcially resides. Strip it since - // the function is not a part of the call context. We also - // don't need its inline context since the probes being unwound - // come with an inline context all the way back to the uninlined - // function in their prefix tree. - auto Iter = CallStack.rbegin(); - auto EndT = std::prev(CallStack.rend()); - for (; Iter != EndT; Iter++) { - uint64_t Address = *Iter; - const PseudoProbe *CallProbe = Binary->getCallProbeForAddr(Address); - // We may not find a probe for a merged or external callsite. - // Callsite merging may cause the loss of original probe IDs. - // Cutting off the context from here since the inline will - // not know how to consume a context with unknown callsites. - if (!CallProbe) - break; - ProbeBasedKey->Probes.emplace_back(CallProbe); - } + for (auto CallProbe : Stack) { + ProbeBasedKey->Probes.emplace_back(CallProbe); } CSProfileGenerator::compressRecursionContext( ProbeBasedKey->Probes); ProbeBasedKey->genHashCode(); - Hashable ContextId(ProbeBasedKey); - auto Ret = CtxCounterMap->emplace(ContextId, SampleCounter()); - return Ret.first->second; + return ProbeBasedKey; +} + +template +void VirtualUnwinder::collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, + T &Stack) { + if (Cur->RangeSamples.empty() && Cur->BranchSamples.empty()) + return; + + auto Ret = CtxCounterMap->emplace(Hashable(Stack.getContextKey()), + SampleCounter()); + SampleCounter &SCounter = Ret.first->second; + for (auto &Item : Cur->RangeSamples) { + uint64_t StartOffset = Binary->virtualAddrToOffset(std::get<0>(Item)); + uint64_t EndOffset = Binary->virtualAddrToOffset(std::get<1>(Item)); + SCounter.recordRangeCount(StartOffset, EndOffset, std::get<2>(Item)); + } + + for (auto &Item : Cur->BranchSamples) { + uint64_t SourceOffset = Binary->virtualAddrToOffset(std::get<0>(Item)); + uint64_t TargetOffset = Binary->virtualAddrToOffset(std::get<1>(Item)); + SCounter.recordBranchCount(SourceOffset, TargetOffset, std::get<2>(Item)); + } +} + +template +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur, T &Stack) { + if (!Cur->isDummyRoot()) { + if (!Stack.pushFrame(Cur)) { + // Process truncated context + for (const auto &Item : Cur->Children) { + // Start a new traversal ignoring its bottom context + collectSamplesFromFrameTrie(Item.second.get()); + } + return; + } + } + + collectSamplesFromFrame(Cur, Stack); + // Process children frame + for (const auto &Item : Cur->Children) { + collectSamplesFromFrameTrie(Item.second.get(), Stack); + } + // Recover the call stack + Stack.popFrame(); } -void VirtualUnwinder::recordRangeCount(uint64_t Start, uint64_t End, - UnwindState &State, uint64_t Repeat) { - uint64_t StartOffset = State.getBinary()->virtualAddrToOffset(Start); - uint64_t EndOffset = State.getBinary()->virtualAddrToOffset(End); - SampleCounter &SCounter = - getOrCreateCounter(State.getBinary(), State.CallStack); - SCounter.recordRangeCount(StartOffset, EndOffset, Repeat); +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur) { + if (Binary->usePseudoProbes()) { + ProbeStack Stack(Binary); + collectSamplesFromFrameTrie(Cur, Stack); + } else { + FrameStack Stack(Binary); + collectSamplesFromFrameTrie(Cur, Stack); + } } void VirtualUnwinder::recordBranchCount(const LBREntry &Branch, UnwindState &State, uint64_t Repeat) { if (Branch.IsArtificial) return; - uint64_t SourceOffset = State.getBinary()->virtualAddrToOffset(Branch.Source); - uint64_t TargetOffset = State.getBinary()->virtualAddrToOffset(Branch.Target); - SampleCounter &SCounter = - getOrCreateCounter(State.getBinary(), State.CallStack); - SCounter.recordBranchCount(SourceOffset, TargetOffset, Repeat); + + if (Binary->usePseudoProbes()) { + // Same as recordRangeCount, We don't need to top frame probe since we will + // extract it from branch's source address + State.FrameCurrentLeaf->Parent->recordBranchCount(Branch.Source, + Branch.Target, Repeat); + } else { + State.FrameCurrentLeaf->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + } } bool VirtualUnwinder::unwind(const HybridSample *Sample, uint64_t Repeat) { @@ -199,6 +223,8 @@ // Record `branch` with calling context after unwinding. recordBranchCount(Branch, State, Repeat); } + // As samples are aggregated on trie, record them into counter map + collectSamplesFromFrameTrie(State.getDummyRootPtr()); return true; } @@ -325,7 +351,8 @@ void PerfReader::unwindSamples() { for (const auto &Item : AggregatedSamples) { const HybridSample *Sample = dyn_cast(Item.first.getPtr()); - VirtualUnwinder Unwinder(&BinarySampleCounters[Sample->Binary]); + VirtualUnwinder Unwinder(&BinarySampleCounters[Sample->Binary], + Sample->Binary); Unwinder.unwind(Sample, Item.second); } @@ -334,7 +361,7 @@ } bool PerfReader::extractLBRStack(TraceStream &TraceIt, - SmallVector &LBRStack, + SmallVectorImpl &LBRStack, ProfiledBinary *Binary) { // The raw format of LBR stack is like: // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... @@ -398,7 +425,7 @@ } bool PerfReader::extractCallstack(TraceStream &TraceIt, - std::list &CallStack) { + SmallVectorImpl &CallStack) { // The raw format of call stack is like: // 4005dc # leaf frame // 400634 diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h --- a/llvm/tools/llvm-profgen/ProfiledBinary.h +++ b/llvm/tools/llvm-profgen/ProfiledBinary.h @@ -236,7 +236,8 @@ // Get the context string of the current stack with inline context filled in. // It will search the disassembling info stored in Offset2LocStackMap. This is // used as the key of function sample map - std::string getExpandedContextStr(const std::list &stack) const; + std::string + getExpandedContextStr(const SmallVectorImpl &Stack) const; const PseudoProbe *getCallProbeForAddr(uint64_t Address) const { return ProbeDecoder.getCallProbeForAddr(Address); diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp --- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp +++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp @@ -126,13 +126,13 @@ Context2.begin(), Context2.begin() + Context2.size() - 1); } -std::string -ProfiledBinary::getExpandedContextStr(const std::list &Stack) const { +std::string ProfiledBinary::getExpandedContextStr( + const SmallVectorImpl &Stack) const { std::string ContextStr; SmallVector ContextVec; // Process from frame root to leaf - for (auto Iter = Stack.rbegin(); Iter != Stack.rend(); Iter++) { - uint64_t Offset = virtualAddrToOffset(*Iter); + for (auto Address : Stack) { + uint64_t Offset = virtualAddrToOffset(Address); const FrameLocationStack &ExpandedContext = getFrameLocationStack(Offset); for (const auto &Loc : ExpandedContext) { ContextVec.push_back(getCallSite(Loc));