diff --git a/llvm/test/tools/llvm-profgen/Inputs/cs-interrupt.perfscript b/llvm/test/tools/llvm-profgen/Inputs/cs-interrupt.perfscript --- a/llvm/test/tools/llvm-profgen/Inputs/cs-interrupt.perfscript +++ b/llvm/test/tools/llvm-profgen/Inputs/cs-interrupt.perfscript @@ -4,13 +4,13 @@ 400634 400684 7f68c5788793 - 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 0x400645/0x4005ff/P/-/-/0 0x400637/0x400645/P/-/-/0 0x4005e9/0x400634/P/-/-/0 0x4005d7/0x4005e5/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 0x400645/0x4005ff/P/-/-/0 0x400637/0x400645/P/-/-/0 0x4005e9/0x400634/P/-/-/0 0x4005d7/0x4005e5/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 0x400645/0x4005ff/P/-/-/0 0x400637/0x400645/P/-/-/0 0xffffffff81c009d0/0x400634/P/-/-/0 0x40048a/0x40048e/P/-/-/0 + 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 0x400645/0x4005ff/P/-/-/0 0x400637/0x400645/P/-/-/0 0x4005e9/0x400634/P/-/-/0 0x4005d7/0x4005e5/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 0x400645/0x4005ff/P/-/-/0 0x400637/0x400645/P/-/-/0 0x4005e9/0x400634/P/-/-/0 0x4005d7/0x4005e5/P/-/-/0 0xffffffff81c009d0/0x4005c3/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 0x400645/0x4005ff/P/-/-/0 0x400637/0x400645/P/-/-/0 4005dc 400634 400684 7f68c5788793 - 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 0x400645/0x4005ff/P/-/-/0 0x400637/0x400645/P/-/-/0 0x4005e9/0x400634/P/-/-/0 0x4005d7/0x4005e5/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 0x400645/0x4005ff/P/-/-/0 0x400637/0x400645/P/-/-/0 0x4005e9/0x400634/P/-/-/0 0x4005d7/0x4005e5/P/-/-/0 0x400634/0xffffffff81c009d0/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 0x400645/0x4005ff/P/-/-/0 0x400637/0x400645/P/-/-/0 0x40048a/0x40048e/P/-/-/0 + 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 0x400645/0x4005ff/P/-/-/0 0x400637/0x400645/P/-/-/0 0x4005e9/0x400634/P/-/-/0 0x4005d7/0x4005e5/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 0x400645/0x4005ff/P/-/-/0 0x400637/0x400645/P/-/-/0 0x4005e9/0x400634/P/-/-/0 0x4005d7/0x4005e5/P/-/-/0 0x4005c3/0xffffffff81c009d0/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 0x400645/0x4005ff/P/-/-/0 0x400637/0x400645/P/-/-/0 // Transition 0xffffffff81c009d0/0x400634 is due to interrupt. Records after it, i.e, 0x40048a/0x40048e, should be ignored to avoid bogus instruction ranges. diff --git a/llvm/test/tools/llvm-profgen/callback-external-addr.test b/llvm/test/tools/llvm-profgen/callback-external-addr.test --- a/llvm/test/tools/llvm-profgen/callback-external-addr.test +++ b/llvm/test/tools/llvm-profgen/callback-external-addr.test @@ -1,6 +1,9 @@ ; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/callback-external-addr.perfscript --binary=%S/Inputs/callback-external-addr.perfbin --output=%t --skip-symbolization ; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-UNWINDER +; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/callback-external-addr.perfscript --binary=%S/Inputs/callback-external-addr.perfbin --output=%t --profile-summary-cold-count=0 --csspgo-preinliner=0 --gen-cs-nested-profile=0 +; RUN: FileCheck %s --input-file %t + ; Test if call stack is correctly truncated. ; CHECK-UNWINDER-NOT: main:3 @ bar ; CHECK-UNWINDER-NOT: main:3 @ foo @@ -18,9 +21,18 @@ ; [foo:2 @ qux:2 @ callBeforeReturn] and [foo:2 @ qux:4 @ callAfterReturn] should exist ; which means the callback return won't interrupt the previous call stack + +; CHECK-UNWINDER: [] +; CHECK-UNWINDER: 0 +; CHECK-UNWINDER: 5 +; CHECK-UNWINDER: ffffffffffbfffff->690:13 +; CHECK-UNWINDER: ffffffffffbfffff->6a0:7 +; CHECK-UNWINDER: ffffffffffbfffff->715:7 +; CHECK-UNWINDER: ffffffffffbfffff->730:5 +; CHECK-UNWINDER: ffffffffffbfffff->7b0:5 ; CHECK-UNWINDER: [bar] ; CHECK-UNWINDER: 1 -; CHECK-UNWINDER: 690-69e:12 +; CHECK-UNWINDER: 690-69e:13 ; CHECK-UNWINDER: 0 ; CHECK-UNWINDER: [baz] ; CHECK-UNWINDER: 1 @@ -59,6 +71,34 @@ ; CHECK-UNWINDER: 1 ; CHECK-UNWINDER: 7bf->77d:5 +; CHECK: [foo:2 @ qux]:132:5 +; CHECK: 0: 5 +; CHECK: 1: 5 +; CHECK: 2: 6 callBeforeReturn:6 +; CHECK: 3: 7 +; CHECK: 4: 7 callAfterReturn:7 +; CHECK: 5: 6 +; CHECK: [bar]:91:13 +; CHECK: 0: 13 +; CHECK: 1: 13 +; CHECK: [main]:65:0 +; CHECK: 2.1: 5 +; CHECK: 2.2: 5 +; CHECK: 3: 5 +; CHECK: [foo]:60:5 +; CHECK: 0: 5 +; CHECK: 1: 5 +; CHECK: 2: 5 qux:5 +; CHECK: [baz]:49:7 +; CHECK: 0: 7 +; CHECK: 1: 7 +; CHECK: [foo:2 @ qux:4 @ callAfterReturn]:49:7 +; CHECK: 0: 7 +; CHECK: 1: 7 +; CHECK: [foo:2 @ qux:2 @ callBeforeReturn]:42:6 +; CHECK: 0: 6 +; CHECK: 1: 6 + ; libcallback.c ; clang -shared -fPIC -o libcallback.so libcallback.c diff --git a/llvm/test/tools/llvm-profgen/cs-interrupt.test b/llvm/test/tools/llvm-profgen/cs-interrupt.test --- a/llvm/test/tools/llvm-profgen/cs-interrupt.test +++ b/llvm/test/tools/llvm-profgen/cs-interrupt.test @@ -5,35 +5,57 @@ ; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cs-interrupt.perfscript --binary=%S/Inputs/noinline-cs-noprobe.perfbin --output=%t --profile-summary-cold-count=0 --csspgo-preinliner=0 --gen-cs-nested-profile=0 ; RUN: FileCheck %s --input-file %t -; CHECK:[main:1 @ foo]:88:0 -; CHECK: 2: 5 -; CHECK: 3: 5 bar:5 -; CHECK:[main:1 @ foo:3 @ bar]:74:5 -; CHECK: 0: 5 -; CHECK: 1: 5 -; CHECK: 2: 3 -; CHECK: 5: 4 +; CHECK: [main:1 @ foo]:72:0 +; CHECK: 2: 4 +; CHECK: 3: 4 bar:4 +; CHECK: [main:1 @ foo:3 @ bar]:67:4 +; CHECK: 0: 4 +; CHECK: 1: 5 +; CHECK: 2: 3 +; CHECK: 5: 4 +; CHECK: [foo]:32:0 +; CHECK: 2: 2 +; CHECK: 3: 2 bar:2 +; CHECK: [bar]:8:2 +; CHECK: 0: 1 +; CHECK: 1: 1 -; CHECK-UNWINDER: [main:1 @ foo] +; CHECK-UNWINDER: [] +; CHECK-UNWINDER-NEXT: 0 +; CHECK-UNWINDER-NEXT: 1 +; CHECK-UNWINDER-NEXT: ffffffffffbfffff->5c3:1 +; CHECK-UNWINDER-NEXT: [bar] +; CHECK-UNWINDER-NEXT: 1 +; CHECK-UNWINDER-NEXT: 5b0-5c3:1 +; CHECK-UNWINDER-NEXT: 0 +; CHECK-UNWINDER-NEXT: [foo] +; CHECK-UNWINDER-NEXT: 2 +; CHECK-UNWINDER-NEXT: 5ff-62f:2 +; CHECK-UNWINDER-NEXT: 645-645:2 ; CHECK-UNWINDER-NEXT: 3 -; CHECK-UNWINDER-NEXT: 5ff-62f:5 +; CHECK-UNWINDER-NEXT: 62f->5b0:2 +; CHECK-UNWINDER-NEXT: 637->645:2 +; CHECK-UNWINDER-NEXT: 645->5ff:2 +; CHECK-UNWINDER-NEXT: [main:1 @ foo] +; CHECK-UNWINDER-NEXT: 3 +; CHECK-UNWINDER-NEXT: 5ff-62f:4 ; CHECK-UNWINDER-NEXT: 634-637:4 -; CHECK-UNWINDER-NEXT: 645-645:5 +; CHECK-UNWINDER-NEXT: 645-645:4 ; CHECK-UNWINDER-NEXT: 3 -; CHECK-UNWINDER-NEXT: 62f->5b0:5 -; CHECK-UNWINDER-NEXT: 637->645:5 -; CHECK-UNWINDER-NEXT: 645->5ff:5 +; CHECK-UNWINDER-NEXT: 62f->5b0:4 +; CHECK-UNWINDER-NEXT: 637->645:4 +; CHECK-UNWINDER-NEXT: 645->5ff:4 ; CHECK-UNWINDER-NEXT: [main:1 @ foo:3 @ bar] -; CHECK-UNWINDER-NEXT: 3 +; CHECK-UNWINDER-NEXT: 4 ; CHECK-UNWINDER-NEXT: 5b0-5c8:2 -; CHECK-UNWINDER-NEXT: 5b0-5d7:3 +; CHECK-UNWINDER-NEXT: 5b0-5d7:2 +; CHECK-UNWINDER-NEXT: 5c3-5d7:1 ; CHECK-UNWINDER-NEXT: 5e5-5e9:4 ; CHECK-UNWINDER-NEXT: 3 ; CHECK-UNWINDER-NEXT: 5c8->5dc:2 ; CHECK-UNWINDER-NEXT: 5d7->5e5:4 ; CHECK-UNWINDER-NEXT: 5e9->634:4 - ; original code: ; clang -O0 -g test.c -o a.out #include diff --git a/llvm/test/tools/llvm-profgen/inline-noprobe2.test b/llvm/test/tools/llvm-profgen/inline-noprobe2.test --- a/llvm/test/tools/llvm-profgen/inline-noprobe2.test +++ b/llvm/test/tools/llvm-profgen/inline-noprobe2.test @@ -1,5 +1,5 @@ ; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/artificial-branch.perfscript --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t --skip-symbolization --use-offset=0 -; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-ARTIFICIAL-BRANCH +; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-EXT-ADDR ; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/inline-noprobe2.perfscript --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t --skip-symbolization --use-offset=0 ; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-RAW-PROFILE ; RUN: llvm-profgen --format=text --unsymbolized-profile=%t --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 @@ -8,11 +8,12 @@ ; RUN: llvm-profgen --format=extbinary --perfscript=%S/Inputs/inline-noprobe2.perfscript --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t --populate-profile-symbol-list=1 ; RUN: llvm-profdata show -show-prof-sym-list -sample %t | FileCheck %s --check-prefix=CHECK-SYM-LIST -; CHECK-ARTIFICIAL-BRANCH: 2 -; CHECK-ARTIFICIAL-BRANCH: 400870-400870:2 -; CHECK-ARTIFICIAL-BRANCH: 400875-4008bf:1 -; CHECK-ARTIFICIAL-BRANCH: 1 -; CHECK-ARTIFICIAL-BRANCH: 4008bf->400870:2 +; CHECK-EXT-ADDR: 2 +; CHECK-EXT-ADDR-NEXT: 400870-400870:2 +; CHECK-EXT-ADDR-NEXT: 400875-4008bf:1 +; CHECK-EXT-ADDR-NEXT: 2 +; CHECK-EXT-ADDR-NEXT: 4008bf->400870:2 +; CHECK-EXT-ADDR-NEXT: ffffffffffffffff->400875:1 ; CHECK-SYM-LIST: Dump profile symbol list ; CHECK-SYM-LIST: main @@ -46,6 +47,22 @@ ;CHECK: 1: 6 ;CHECK: 2: 6 ;CHECK: 3: 6 +;CHECK: main:1362:0 +;CHECK: 0: 0 +;CHECK: 3: 0 +;CHECK: 4.1: 0 +;CHECK: 4.3: 0 +;CHECK: 5.1: 17 +;CHECK: 5.3: 17 +;CHECK: 6: 17 +;CHECK: 6.1: 17 +;CHECK: 6.3: 17 +;CHECK: 7: 0 +;CHECK: 8: 0 quick_sort:1 +;CHECK: 9: 0 +;CHECK: 11: 0 +;CHECK: 14: 0 +;CHECK: 65499: 0 ;CHECK: partition_pivot_last:1210:7 ;CHECK: 1: 6 ;CHECK: 2: 6 @@ -76,22 +93,6 @@ ;CHECK: 1: 5 ;CHECK: 2: 5 ;CHECK: 3: 5 -;CHECK: main:906:0 -;CHECK: 0: 0 -;CHECK: 3: 0 -;CHECK: 4.1: 0 -;CHECK: 4.3: 0 -;CHECK: 5.1: 11 -;CHECK: 5.3: 11 -;CHECK: 6: 11 -;CHECK: 6.1: 14 -;CHECK: 6.3: 11 -;CHECK: 7: 0 -;CHECK: 8: 0 quick_sort:1 -;CHECK: 9: 0 -;CHECK: 11: 0 -;CHECK: 14: 0 -;CHECK: 65499: 0 ;CHECK: quick_sort:903:25 ;CHECK: 1: 24 ;CHECK: 2: 12 partition_pivot_last:7 partition_pivot_first:5 diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h --- a/llvm/tools/llvm-profgen/PerfReader.h +++ b/llvm/tools/llvm-profgen/PerfReader.h @@ -84,19 +84,12 @@ struct LBREntry { uint64_t Source = 0; uint64_t Target = 0; - // An artificial branch stands for a series of consecutive branches starting - // from the current binary with a transition through external code and - // eventually landing back in the current binary. - bool IsArtificial = false; - LBREntry(uint64_t S, uint64_t T, bool I) - : Source(S), Target(T), IsArtificial(I) {} + LBREntry(uint64_t S, uint64_t T) : Source(S), Target(T) {} #ifndef NDEBUG void print() const { dbgs() << "from " << format("%#010x", Source) << " to " << format("%#010x", Target); - if (IsArtificial) - dbgs() << " Artificial"; } #endif }; @@ -276,7 +269,7 @@ // When we take a stack sample, ideally the sampling distance between the // leaf IP of stack and the last LBR target shouldn't be very large. // Use a heuristic size (0x100) to filter out broken records. - if (LeafAddr < LBRLeaf || LeafAddr >= LBRLeaf + 0x100) { + if (LeafAddr < LBRLeaf || LeafAddr - LBRLeaf >= 0x100) { WithColor::warning() << "Bogus trace: stack tip = " << format("%#010x", LeafAddr) << ", LBR tip = " << format("%#010x\n", LBRLeaf); @@ -480,11 +473,32 @@ uint64_t NumMismatchedExtCallBranch = 0; private: + bool isSourceExternal(UnwindState &State) const { + return State.getCurrentLBRSource() == ExternalAddr; + } + + bool isTargetExternal(UnwindState &State) const { + return State.getCurrentLBRTarget() == ExternalAddr; + } + + bool isReturnFromExternal(UnwindState &State) const { + // Detect a return by checking if it's the next inst of call inst. + uint64_t CallAddr = + Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()); + return (CallAddr != 0); + } + bool isCallState(UnwindState &State) const { // The tail call frame is always missing here in stack sample, we will // use a specific tail call tracker to infer it. - return isValidState(State) && - Binary->addressIsCall(State.getCurrentLBRSource()); + if (!isValidState(State)) + return false; + + if (Binary->addressIsCall(State.getCurrentLBRSource())) + return true; + + // Check if the call source is from external code. + return isSourceExternal(State) && !isReturnFromExternal(State); } bool isReturnState(UnwindState &State) const { @@ -493,19 +507,11 @@ // Simply check addressIsReturn, as ret is always reliable, both for // regular call and tail call. - if (!Binary->addressIsReturn(State.getCurrentLBRSource())) - return false; + if (Binary->addressIsReturn(State.getCurrentLBRSource())) + return true; - // In a callback case, a return from internal code, say A, to external - // runtime can happen. The external runtime can then call back to - // another internal routine, say B. Making an artificial branch that - // looks like a return from A to B can confuse the unwinder to treat - // the instruction before B as the call instruction. Here we detect this - // case if the return target is not the next inst of call inst, then we just - // do not treat it as a return. - uint64_t CallAddr = - Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()); - return (CallAddr != 0); + // Check if the return source is from external code. + return isSourceExternal(State) && isReturnFromExternal(State); } bool isValidState(UnwindState &State) const { return !State.Invalid; } diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp --- a/llvm/tools/llvm-profgen/PerfReader.cpp +++ b/llvm/tools/llvm-profgen/PerfReader.cpp @@ -53,27 +53,6 @@ void VirtualUnwinder::unwindCall(UnwindState &State) { uint64_t Source = State.getCurrentLBRSource(); - // An artificial return should push an external frame and an artificial call - // will match it and pop the external frame so that the context before and - // after the external call will be the same. - if (State.getCurrentLBR().IsArtificial) { - NumExtCallBranch++; - // A return is matched and pop the external frame. - if (State.getParentFrame()->isExternalFrame()) { - State.popFrame(); - } else { - // An artificial return is missing, it happens that the sample is just hit - // in the middle of the external code. In this case, the leading branch is - // a call to external, we just keep unwinding use a context-less stack. - if (State.getParentFrame() != State.getDummyRootPtr()) - NumMissingExternalFrame++; - State.clearCallStack(); - State.pushFrame(Source); - State.InstPtr.update(Source); - return; - } - } - auto *ParentFrame = State.getParentFrame(); // The 2nd frame after leaf could be missing if stack sample is // taken when IP is within prolog/epilog, as frame chain isn't @@ -85,9 +64,11 @@ ParentFrame->Address != Source) { State.switchToFrame(Source); if (ParentFrame != State.getDummyRootPtr()) { - if (State.getCurrentLBR().IsArtificial) + if (Source == ExternalAddr) { NumMismatchedExtCallBranch++; - else + State.clearCallStack(); + State.pushFrame(Source); + } else NumMismatchedProEpiBranch++; } } else { @@ -100,11 +81,17 @@ InstructionPointer &IP = State.InstPtr; uint64_t Target = State.getCurrentLBRTarget(); uint64_t End = IP.Address; + + // Skip recording LBR trace with external address. + if (End == ExternalAddr || Target == ExternalAddr) + return; + if (Target > End) { // Skip unwinding the rest of LBR trace when a bogus range is seen. State.setInvalid(); return; } + if (Binary->usePseudoProbes()) { // We don't need to top frame probe since it should be extracted // from the range. @@ -150,19 +137,6 @@ const LBREntry &LBR = State.getCurrentLBR(); uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target); State.switchToFrame(CallAddr); - // Push an external frame for the case of returning to external - // address(callback), later if an aitificial call is matched and it will be - // popped up. This is to 1)avoid context being interrupted by callback, - // context before or after the callback should be the same. 2) the call stack - // of function called by callback should be truncated which is done during - // recording the context on trie. For example: - // main (call)--> foo (call)--> callback (call)--> bar (return)--> callback - // (return)--> foo (return)--> main - // Context for bar should not include main and foo. - // For the code of foo, the context of before and after callback should both - // be [foo, main]. - if (LBR.IsArtificial) - State.pushFrame(ExternalAddr); State.pushFrame(LBR.Source); State.InstPtr.update(LBR.Source); } @@ -179,8 +153,6 @@ std::shared_ptr KeyStr = std::make_shared(); KeyStr->Context = Binary->getExpandedContext(Stack, KeyStr->WasLeafInlined); - if (KeyStr->Context.empty()) - return nullptr; return KeyStr; } @@ -262,8 +234,16 @@ void VirtualUnwinder::recordBranchCount(const LBREntry &Branch, UnwindState &State, uint64_t Repeat) { - if (Branch.IsArtificial || Branch.Target == ExternalAddr) + if (Branch.Target == ExternalAddr) + return; + + // Record external-call-internal pattern on the trie root, it later can be + // used for generating head samples. + if (Branch.Source == ExternalAddr) { + State.getDummyRootPtr()->recordBranchCount(Branch.Source, Branch.Target, + Repeat); return; + } if (Binary->usePseudoProbes()) { // Same as recordRangeCount, We don't need to top frame probe since we will @@ -300,7 +280,6 @@ // Save the LBR branch before it gets unwound. const LBREntry &Branch = State.getCurrentLBR(); - if (isCallState(State)) { // Unwind calls - we know we encountered call if LBR overlaps with // transition between leaf the 2nd frame. Note that for calls that @@ -311,7 +290,7 @@ // Unwind returns - check whether the IP is indeed at a return // instruction unwindReturn(State); - } else if (isValidState(State)) { + } else if (isValidState(State) && !isTargetExternal(State)) { // Unwind branches // For regular intra function branches, we only need to record branch // with context. For an artificial branch cross function boundaries, we @@ -556,11 +535,10 @@ } Index = 1; } + // Now extract LBR samples - note that we do not reverse the // LBR entry order so we can unwind the sample stack as we walk // through LBR entries. - uint64_t PrevTrDst = 0; - while (Index < Records.size()) { auto &Token = Records[Index++]; if (Token.size() == 0) @@ -580,68 +558,16 @@ bool SrcIsInternal = Binary->addressIsCode(Src); bool DstIsInternal = Binary->addressIsCode(Dst); - bool IsExternal = !SrcIsInternal && !DstIsInternal; - bool IsIncoming = !SrcIsInternal && DstIsInternal; - bool IsOutgoing = SrcIsInternal && !DstIsInternal; - bool IsArtificial = false; - - // Ignore branches outside the current binary. - if (IsExternal) { - if (!PrevTrDst && !LBRStack.empty()) { - WithColor::warning() - << "Invalid transfer to external code in LBR record at line " - << TraceIt.getLineNumber() << ": " << TraceIt.getCurrentLine() - << "\n"; - } - // Do not ignore the entire samples, the remaining LBR can still be - // unwound using a context-less stack. + if (!SrcIsInternal) + Src = ExternalAddr; + if (!DstIsInternal) + Dst = ExternalAddr; + // Filter external-to-external case to reduce LBR trace size. + if (!SrcIsInternal && !DstIsInternal) continue; - } - - if (IsOutgoing) { - if (!PrevTrDst) { - // This is a leading outgoing LBR, we should keep processing the LBRs. - if (LBRStack.empty()) { - NumLeadingOutgoingLBR++; - // Record this LBR since current source and next LBR' target is still - // a valid range. - LBRStack.emplace_back(LBREntry(Src, ExternalAddr, false)); - continue; - } - // This is middle unpaired outgoing jump which is likely due to - // interrupt or incomplete LBR trace. Ignore current and subsequent - // entries since they are likely in different contexts. - break; - } - - // For transition to external code, group the Source with the next - // availabe transition target. - Dst = PrevTrDst; - PrevTrDst = 0; - IsArtificial = true; - } else { - if (PrevTrDst) { - // If we have seen an incoming transition from external code to internal - // code, but not a following outgoing transition, the incoming - // transition is likely due to interrupt which is usually unpaired. - // Ignore current and subsequent entries since they are likely in - // different contexts. - break; - } - - if (IsIncoming) { - // For transition from external code (such as dynamic libraries) to - // the current binary, keep track of the branch target which will be - // grouped with the Source of the last transition from the current - // binary. - PrevTrDst = Dst; - continue; - } - } // TODO: filter out buggy duplicate branches on Skylake - - LBRStack.emplace_back(LBREntry(Src, Dst, IsArtificial)); + LBRStack.emplace_back(LBREntry(Src, Dst)); } TraceIt.advance(); return !LBRStack.empty(); @@ -921,27 +847,22 @@ void PerfScriptReader::computeCounterFromLBR(const PerfSample *Sample, uint64_t Repeat) { SampleCounter &Counter = SampleCounters.begin()->second; - uint64_t EndOffeset = 0; + uint64_t ExternalOffset = Binary->virtualAddrToOffset(ExternalAddr); + uint64_t EndOffeset = ExternalOffset; for (const LBREntry &LBR : Sample->LBRStack) { - assert(LBR.Source != ExternalAddr && - "Branch' source should not be an external address, it should be " - "converted to aritificial branch."); uint64_t SourceOffset = Binary->virtualAddrToOffset(LBR.Source); - uint64_t TargetOffset = LBR.Target == static_cast(ExternalAddr) - ? static_cast(ExternalAddr) - : Binary->virtualAddrToOffset(LBR.Target); + uint64_t TargetOffset = Binary->virtualAddrToOffset(LBR.Target); - if (!LBR.IsArtificial && TargetOffset != ExternalAddr) { + if (TargetOffset != ExternalOffset) { Counter.recordBranchCount(SourceOffset, TargetOffset, Repeat); } // If this not the first LBR, update the range count between TO of current // LBR and FROM of next LBR. uint64_t StartOffset = TargetOffset; - if (EndOffeset != 0) { - if (StartOffset <= EndOffeset) - Counter.recordRangeCount(StartOffset, EndOffeset, Repeat); - } + if (EndOffeset != ExternalOffset && StartOffset != ExternalOffset && + StartOffset <= EndOffeset) + Counter.recordRangeCount(StartOffset, EndOffeset, Repeat); EndOffeset = SourceOffset; } } diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h --- a/llvm/tools/llvm-profgen/ProfileGenerator.h +++ b/llvm/tools/llvm-profgen/ProfileGenerator.h @@ -304,6 +304,7 @@ FunctionSamples &FunctionProfile, const BranchSample &BranchCounters); void populateInferredFunctionSamples(); + void populateHeadSamplesForExternalCall(const BranchSample &BranchCounters); void generateProbeBasedProfile(); diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp --- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp +++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp @@ -765,6 +765,12 @@ for (const auto &CI : *SampleCounters) { const auto *CtxKey = cast(CI.first.getPtr()); + // Sample context will be empty if the jump is an external-call-internal + // pattern, the head samples should be added for the internal function. + if (CtxKey->Context.empty()) { + populateHeadSamplesForExternalCall(CI.second.BranchCounter); + continue; + } // Get or create function profile for the range FunctionSamples &FunctionProfile = getFunctionProfileForContext(CtxKey->Context, CtxKey->WasLeafInlined); @@ -908,6 +914,24 @@ } } +void CSProfileGenerator::populateHeadSamplesForExternalCall( + const BranchSample &BranchCounters) { + for (auto Entry : BranchCounters) { + uint64_t TargetOffset = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForOffset(TargetOffset); + if (CalleeName.size() == 0) + continue; + // Add head samples for callee. + SampleContextFrameVector CalleeCtx; + CalleeCtx.emplace_back(CalleeName, LineLocation(0, 0)); + FunctionSamples &CalleeProfile = getFunctionProfileForContext(CalleeCtx); + CalleeProfile.addHeadSamples(Count); + } +} + void CSProfileGenerator::postProcessProfiles() { // Compute hot/cold threshold based on profile. This will be used for cold // context profile merging/trimming. diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h --- a/llvm/tools/llvm-profgen/ProfiledBinary.h +++ b/llvm/tools/llvm-profgen/ProfiledBinary.h @@ -72,13 +72,14 @@ }; // The special frame addresses. -enum SpecialFrameAddr { +enum SpecialFrameAddr : uint64_t { // Dummy root of frame trie. DummyRoot = 0, - // Represent all the addresses outside of current binary. + // Represent all the addresses outside of current binary. Use a large value to + // avoid integer overflow. // This's also used to indicate the call stack should be truncated since this // isn't a real call context the compiler will see. - ExternalAddr = 1, + ExternalAddr = UINT64_MAX, }; using RangesTy = std::vector>; diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp --- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp +++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp @@ -256,6 +256,8 @@ ProfiledBinary::getExpandedContext(const SmallVectorImpl &Stack, bool &WasLeafInlined) { SampleContextFrameVector ContextVec; + if (Stack.empty()) + return ContextVec; // Process from frame root to leaf for (auto Address : Stack) { uint64_t Offset = virtualAddrToOffset(Address);