diff --git a/llvm/test/tools/llvm-profgen/Inputs/callback-external-addr.perfbin b/llvm/test/tools/llvm-profgen/Inputs/callback-external-addr.perfbin new file mode 100755 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@6d0:5 +; CHECK-UNWINDER: [foo:2 @ qux] +; CHECK-UNWINDER: 4 +; CHECK-UNWINDER: 6d0-6e7:5 +; CHECK-UNWINDER: 6ec-710:6 +; CHECK-UNWINDER: 715-71b:7 +; CHECK-UNWINDER: 720-72b:6 +; CHECK-UNWINDER: 3 +; CHECK-UNWINDER: 6e7->6b0:6 +; CHECK-UNWINDER: 71b->6c0:7 +; CHECK-UNWINDER: 72b->74c:6 +; CHECK-UNWINDER: [foo:2 @ qux:2 @ callBeforeReturn] +; CHECK-UNWINDER: 1 +; CHECK-UNWINDER: 6b0-6be:6 +; CHECK-UNWINDER: 1 +; CHECK-UNWINDER: 6be->6ec:7 +; CHECK-UNWINDER: [foo:2 @ qux:4 @ callAfterReturn] +; CHECK-UNWINDER: 1 +; CHECK-UNWINDER: 6c0-6ce:7 +; CHECK-UNWINDER: 1 +; CHECK-UNWINDER: 6ce->720:7 +; CHECK-UNWINDER: [main] +; CHECK-UNWINDER: 2 +; CHECK-UNWINDER: 77d-7ab:5 +; CHECK-UNWINDER: 7b0-7bf:5 +; CHECK-UNWINDER: 1 +; CHECK-UNWINDER: 7bf->77d:5 + +; libcallback.c +; clang -shared -fPIC -o libcallback.so libcallback.c + +int callback(int *cnt, int (*func1)(int), int (*func2)(int), int p) { + (*cnt)++; + return func1(p) + func2(p); +} + +; test.c +; clang test.c -O0 -g -fno-optimize-sibling-calls -fdebug-info-for-profiling -L $PWD -lcallback -fno-inline + +#include + +int callbackCnt = 0; + +int callback(int *cnt, int (*func1)(int), int (*func2)(int), int p); + +int bar(int p) { + return p + 1; +} + +int baz(int p) { + return p - 1; +} + +int callBeforeReturn(int p) { + return p + 10; +} + +int callAfterReturn(int p) { + return p - 10; +} + +int qux(int p) { + p += 10; + int ret = callBeforeReturn(p); + ret = callback(&callbackCnt, bar, baz, ret); + ret = callAfterReturn(ret); + return ret; +} + +int foo (int p) { + p -= 10; + return qux(p); +} + +int main(void) { + int sum = 0; + for (int i = 0; i < 1000 * 1000; i++) { + sum += callback(&callbackCnt, foo, bar, i); + } + printf("callback count=%d, sum=%d\n", callbackCnt, sum); +} diff --git a/llvm/test/tools/llvm-profgen/inline-noprobe2.test b/llvm/test/tools/llvm-profgen/inline-noprobe2.test --- a/llvm/test/tools/llvm-profgen/inline-noprobe2.test +++ b/llvm/test/tools/llvm-profgen/inline-noprobe2.test @@ -8,8 +8,11 @@ ; RUN: llvm-profgen --format=extbinary --perfscript=%S/Inputs/inline-noprobe2.perfscript --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t --populate-profile-symbol-list=1 ; RUN: llvm-profdata show -show-prof-sym-list -sample %t | FileCheck %s --check-prefix=CHECK-SYM-LIST -; CHECK-ARTIFICIAL-BRANCH: 0 -; CHECK-ARTIFICIAL-BRANCH: 0 +; CHECK-ARTIFICIAL-BRANCH: 2 +; CHECK-ARTIFICIAL-BRANCH: 400870-400870:2 +; CHECK-ARTIFICIAL-BRANCH: 400875-4008bf:1 +; CHECK-ARTIFICIAL-BRANCH: 1 +; CHECK-ARTIFICIAL-BRANCH: 4008bf->400870:2 ; CHECK-SYM-LIST: Dump profile symbol list ; CHECK-SYM-LIST: main diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h --- a/llvm/tools/llvm-profgen/PerfReader.h +++ b/llvm/tools/llvm-profgen/PerfReader.h @@ -214,14 +214,6 @@ using SampleVector = SmallVector, 16>; -// The special frame addresses. -enum SpecialFrameAddr { - // Dummy root of frame trie. - DummyRoot = 0, - // Represent all the addresses outside of current binary. - ExternalAddr = 1, -}; - // The state for the unwinder, it doesn't hold the data but only keep the // pointer/index of the data, While unwinding, the CallStack is changed // dynamicially and will be recorded as the context of the sample @@ -313,6 +305,8 @@ void popFrame() { CurrentLeafFrame = CurrentLeafFrame->Parent; } + void clearCallStack() { CurrentLeafFrame = &DummyTrieRoot; } + void initFrameTrie(const SmallVectorImpl &CallStack) { ProfiledFrame *Cur = &DummyTrieRoot; for (auto Address : reverse(CallStack)) { @@ -426,10 +420,8 @@ ProfiledBinary *Binary; FrameStack(ProfiledBinary *B) : Binary(B) {} bool pushFrame(UnwindState::ProfiledFrame *Cur) { - // Truncate the context for external frame since this isn't a real call - // context the compiler will see - if (Cur->isExternalFrame()) - return false; + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); Stack.push_back(Cur->Address); return true; } @@ -446,10 +438,8 @@ ProfiledBinary *Binary; ProbeStack(ProfiledBinary *B) : Binary(B) {} bool pushFrame(UnwindState::ProfiledFrame *Cur) { - // Truncate the context for external frame since this isn't a real call - // context the compiler will see - if (Cur->isExternalFrame()) - return false; + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); const MCDecodedPseudoProbe *CallProbe = Binary->getCallProbeForAddr(Cur->Address); // We may not find a probe for a merged or external callsite. @@ -506,6 +496,12 @@ bool unwind(const PerfSample *Sample, uint64_t Repeat); std::set &getUntrackedCallsites() { return UntrackedCallsites; } + uint64_t NumTotalBranches = 0; + uint64_t NumExtCallBranch = 0; + uint64_t NumMissingExternalFrame = 0; + uint64_t NumMismatchedProEpiBranch = 0; + uint64_t NumMismatchedExtCallBranch = 0; + private: bool isCallState(UnwindState &State) const { // The tail call frame is always missing here in stack sample, we will @@ -516,7 +512,19 @@ bool isReturnState(UnwindState &State) const { // Simply check addressIsReturn, as ret is always reliable, both for // regular call and tail call. - return Binary->addressIsReturn(State.getCurrentLBRSource()); + if (!Binary->addressIsReturn(State.getCurrentLBRSource())) + return false; + + // In a callback case, a return from internal code, say A, to external + // runtime can happen. The external runtime can then call back to + // another internal routine, say B. Making an artificial branch that + // looks like a return from A to B can confuse the unwinder to treat + // the instruction before B as the call instruction. Here we detect this + // case if the return target is not the next inst of call inst, then we just + // do not treat it as a return. + uint64_t CallAddr = + Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()); + return (CallAddr != 0); } void unwindCall(UnwindState &State); diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp --- a/llvm/tools/llvm-profgen/PerfReader.cpp +++ b/llvm/tools/llvm-profgen/PerfReader.cpp @@ -51,18 +51,44 @@ namespace sampleprof { void VirtualUnwinder::unwindCall(UnwindState &State) { + uint64_t Source = State.getCurrentLBRSource(); + // An artificial return should push an external frame and an artificial call + // will match it and pop the external frame so that the context before and + // after the external call will be the same. + if (State.getCurrentLBR().IsArtificial) { + NumExtCallBranch++; + // A return is matched and pop the external frame. + if (State.getParentFrame()->isExternalFrame()) { + State.popFrame(); + } else { + // An artificial return is missing, it happens that the sample is just hit + // in the middle of the external code. In this case, the leading branch is + // a call to external, we just keep unwinding use a context-less stack. + if (State.getParentFrame() != State.getDummyRootPtr()) + NumMissingExternalFrame++; + State.clearCallStack(); + State.pushFrame(Source); + State.InstPtr.update(Source); + return; + } + } + + auto *ParentFrame = State.getParentFrame(); // The 2nd frame after leaf could be missing if stack sample is // taken when IP is within prolog/epilog, as frame chain isn't // setup yet. Fill in the missing frame in that case. // TODO: Currently we just assume all the addr that can't match the // 2nd frame is in prolog/epilog. In the future, we will switch to // pro/epi tracker(Dwarf CFI) for the precise check. - uint64_t Source = State.getCurrentLBRSource(); - auto *ParentFrame = State.getParentFrame(); - if (ParentFrame == State.getDummyRootPtr() || ParentFrame->Address != Source) { State.switchToFrame(Source); + if (ParentFrame != State.getDummyRootPtr()) { + if (State.getCurrentLBR().IsArtificial) + NumMismatchedExtCallBranch++; + else + NumMismatchedProEpiBranch++; + } } else { State.popFrame(); } @@ -118,6 +144,19 @@ const LBREntry &LBR = State.getCurrentLBR(); uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target); State.switchToFrame(CallAddr); + // Push an external frame for the case of returning to external + // address(callback), later if an aitificial call is matched and it will be + // popped up. This is to 1)avoid context being interrupted by callback, + // context before or after the callback should be the same. 2) the call stack + // of function called by callback should be truncated which is done during + // recording the context on trie. For example: + // main (call)--> foo (call)--> callback (call)--> bar (return)--> callback + // (return)--> foo (return)--> main + // Context for bar should not include main and foo. + // For the code of foo, the context of before and after callback should both + // be [foo, main]. + if (LBR.IsArtificial) + State.pushFrame(ExternalAddr); State.pushFrame(LBR.Source); State.InstPtr.update(LBR.Source); } @@ -180,7 +219,9 @@ void VirtualUnwinder::collectSamplesFromFrameTrie( UnwindState::ProfiledFrame *Cur, T &Stack) { if (!Cur->isDummyRoot()) { - if (!Stack.pushFrame(Cur)) { + // Truncate the context for external frame since this isn't a real call + // context the compiler will see. + if (Cur->isExternalFrame() || !Stack.pushFrame(Cur)) { // Process truncated context // Start a new traversal ignoring its bottom context T EmptyStack(Binary); @@ -453,6 +494,21 @@ SampleCounters.size(), "of profiled contexts are truncated due to missing probe " "for call instruction."); + + emitWarningSummary( + Unwinder.NumMismatchedExtCallBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to unwinding error of external frame."); + + emitWarningSummary( + Unwinder.NumMismatchedProEpiBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to frame in prolog/epilog."); + + emitWarningSummary(Unwinder.NumMissingExternalFrame, + Unwinder.NumExtCallBranch, + "of artificial call branches but doesn't have an external " + "frame to match."); } bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt, @@ -538,15 +594,6 @@ break; } - if (Binary->addressIsReturn(Src)) { - // In a callback case, a return from internal code, say A, to external - // runtime can happen. The external runtime can then call back to - // another internal routine, say B. Making an artificial branch that - // looks like a return from A to B can confuse the unwinder to treat - // the instruction before B as the call instruction. - break; - } - // For transition to external code, group the Source with the next // availabe transition target. Dst = PrevTrDst; @@ -854,10 +901,15 @@ SampleCounter &Counter = SampleCounters.begin()->second; uint64_t EndOffeset = 0; for (const LBREntry &LBR : Sample->LBRStack) { + assert(LBR.Source != ExternalAddr && + "Branch' source should not be an external address, it should be " + "converted to aritificial branch."); uint64_t SourceOffset = Binary->virtualAddrToOffset(LBR.Source); - uint64_t TargetOffset = Binary->virtualAddrToOffset(LBR.Target); + uint64_t TargetOffset = LBR.Target == ExternalAddr + ? ExternalAddr + : Binary->virtualAddrToOffset(LBR.Target); - if (!LBR.IsArtificial) { + if (!LBR.IsArtificial && TargetOffset != ExternalAddr) { Counter.recordBranchCount(SourceOffset, TargetOffset, Repeat); } diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h --- a/llvm/tools/llvm-profgen/ProfiledBinary.h +++ b/llvm/tools/llvm-profgen/ProfiledBinary.h @@ -70,6 +70,16 @@ void update(uint64_t Addr); }; +// The special frame addresses. +enum SpecialFrameAddr { + // Dummy root of frame trie. + DummyRoot = 0, + // Represent all the addresses outside of current binary. + // This's also used to indicate the call stack should be truncated since this + // isn't a real call context the compiler will see. + ExternalAddr = 1, +}; + using RangesTy = std::vector>; struct BinaryFunction { @@ -386,6 +396,8 @@ } uint64_t getCallAddrFromFrameAddr(uint64_t FrameAddr) const { + if (FrameAddr == ExternalAddr) + return ExternalAddr; auto I = getIndexForAddr(FrameAddr); FrameAddr = I ? getAddressforIndex(I - 1) : 0; if (FrameAddr && addressIsCall(FrameAddr))