diff --git a/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only-tail-call-deduction.yaml b/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only-tail-call-deduction.yaml new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only-tail-call-deduction.yaml @@ -0,0 +1,46 @@ +# RUN: llvm-xray account -d %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=ALL %s +# RUN: llvm-xray account -d -recursive-calls-only %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=RECURSIVE %s + +--- +header: + version: 1 + type: 0 + constant-tsc: true + nonstop-tsc: true + cycle-frequency: 0 +records: +# Here we reconstruct the following call trace: +# +# f1() +# f2() +# f3() +# f2() +# +# But we find that we're missing an exit record for f2() because it's +# tail-called f3(). We make sure that if we see a trace like this that we can +# deduce tail calls, and account the time (potentially wrongly) to f2() when +# f1() exits. That is because we don't go back to f3()'s entry record to +# properly do the math on the timing of f2(). +# +# As a result, we can deduce that f2() is not recursive here. +# +# Note that by default, tail/sibling call deduction is disabled, and is enabled +# with a flag "-d" or "-deduce-sibling-calls". +# + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10000 } + - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 10001 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 10002 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 10003 } + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 10004 } + - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 10005 } + - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-exit, tsc: 10006 } +... + +# ALL: Functions with latencies: 3 +# ALL-NEXT: funcid count [ min, med, 90p, 99p, max] sum function +# ALL-NEXT: 1 1 [ 4.000000, 4.000000, 4.000000, 4.000000, 4.000000] 4.000000 :0:0: @(1) +# ALL-NEXT: 2 2 [ 1.000000, 3.000000, 3.000000, 3.000000, 3.000000] 4.000000 :0:0: @(2) +# ALL-NEXT: 3 1 [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000] 1.000000 :0:0: @(3) + +# RECURSIVE: Functions with latencies: 0 +# RECURSIVE-NEXT: funcid count [ min, med, 90p, 99p, max] sum function diff --git a/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only.yaml b/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only.yaml new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only.yaml @@ -0,0 +1,34 @@ +# RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=ALL %s +# RUN: llvm-xray account -recursive-calls-only %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=RECURSIVE %s + +--- +header: + version: 1 + type: 0 + constant-tsc: true + nonstop-tsc: true + cycle-frequency: 2601000000 +records: + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 0 } + - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 100000000 } + - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 200000000 } + - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-exit, tsc: 300000000 } + - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 400000000 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 500000000 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 600000000 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 700000000 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 800000000 } + - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-exit, tsc: 900000000 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 1000000000 } + - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 1100000000 } +... + +# ALL: Functions with latencies: 3 +# ALL-NEXT: funcid count [ min, med, 90p, 99p, max] sum function +# ALL-NEXT: 1 1 [ 0.038447, 0.038447, 0.038447, 0.038447, 0.038447] 0.038447 :0:0: @(1) +# ALL-NEXT: 2 2 [ 0.038447, 0.192234, 0.192234, 0.192234, 0.192234] 0.230681 :0:0: @(2) +# ALL-NEXT: 3 3 [ 0.038447, 0.038447, 0.115340, 0.115340, 0.115340] 0.192234 :0:0: @(3) + +# RECURSIVE: Functions with latencies: 1 +# RECURSIVE-NEXT: funcid count [ min, med, 90p, 99p, max] sum function +# RECURSIVE-NEXT: 3 2 [ 0.038447, 0.115340, 0.115340, 0.115340, 0.115340] 0.153787 :0:0: @(3) diff --git a/llvm/tools/llvm-xray/xray-account.h b/llvm/tools/llvm-xray/xray-account.h --- a/llvm/tools/llvm-xray/xray-account.h +++ b/llvm/tools/llvm-xray/xray-account.h @@ -18,6 +18,7 @@ #include #include "func-id-helper.h" +#include "llvm/ADT/Bitfields.h" #include "llvm/Support/Program.h" #include "llvm/Support/raw_ostream.h" #include "llvm/XRay/XRayRecord.h" @@ -33,7 +34,19 @@ PerThreadMinMaxTSCMap; typedef llvm::DenseMap> PerCPUMinMaxTSCMap; - typedef llvm::SmallVector, 32> FunctionStack; + struct FunctionStack { + llvm::SmallVector, 32> Stack; + class RecursionStatus { + uint32_t Storage = 0; + using Depth = Bitfield::Element; // Low 31 bits. + using IsRecursive = Bitfield::Element; // Sign bit. + public: + RecursionStatus &operator++(); + RecursionStatus &operator--(); + bool isRecursive() const; + }; + Optional> RecursionDepth; + }; typedef llvm::DenseMap PerThreadFunctionStackMap; private: @@ -43,6 +56,7 @@ PerCPUMinMaxTSCMap PerCPUMinMaxTSC; FuncIdConversionHelper &FuncIdHelper; + bool RecursiveCallsOnly = false; bool DeduceSiblingCalls = false; uint64_t CurrentMaxTSC = 0; @@ -52,8 +66,9 @@ public: explicit LatencyAccountant(FuncIdConversionHelper &FuncIdHelper, - bool DeduceSiblingCalls) - : FuncIdHelper(FuncIdHelper), DeduceSiblingCalls(DeduceSiblingCalls) {} + bool RecursiveCallsOnly, bool DeduceSiblingCalls) + : FuncIdHelper(FuncIdHelper), RecursiveCallsOnly(RecursiveCallsOnly), + DeduceSiblingCalls(DeduceSiblingCalls) {} const FunctionLatencyMap &getFunctionLatencies() const { return FunctionLatencies; diff --git a/llvm/tools/llvm-xray/xray-account.cpp b/llvm/tools/llvm-xray/xray-account.cpp --- a/llvm/tools/llvm-xray/xray-account.cpp +++ b/llvm/tools/llvm-xray/xray-account.cpp @@ -35,6 +35,9 @@ cl::sub(Account), cl::init(false)); static cl::alias AccountKeepGoing2("k", cl::aliasopt(AccountKeepGoing), cl::desc("Alias for -keep_going")); +static cl::opt AccountRecursiveCallsOnly( + "recursive-calls-only", cl::desc("Only count the calls that are recursive"), + cl::sub(Account), cl::init(false)); static cl::opt AccountDeduceSiblingCalls( "deduce-sibling-calls", cl::desc("Deduce sibling calls when unrolling function call stacks"), @@ -126,6 +129,32 @@ } // namespace +using RecursionStatus = LatencyAccountant::FunctionStack::RecursionStatus; +RecursionStatus &RecursionStatus::operator++() { + auto Depth = Bitfield::get(Storage); + assert(Depth >= 0 && Depth < std::numeric_limits::max()); + ++Depth; + Bitfield::set(Storage, Depth); // ++Storage + // Did this function just (maybe indirectly) call itself the first time? + if (!isRecursive() && Depth == 2) // Storage == 2 / Storage s> 1 + Bitfield::set(Storage, + true); // Storage |= INT_MIN + return *this; +} +RecursionStatus &RecursionStatus::operator--() { + auto Depth = Bitfield::get(Storage); + assert(Depth > 0); + --Depth; + Bitfield::set(Storage, Depth); // --Storage + // Did we leave a function that previouly (maybe indirectly) called itself? + if (isRecursive() && Depth == 0) // Storage == INT_MIN + Bitfield::set(Storage, false); // Storage = 0 + return *this; +} +bool RecursionStatus::isRecursive() const { + return Bitfield::get(Storage); // Storage s< 0 +} + bool LatencyAccountant::accountRecord(const XRayRecord &Record) { setMinMax(PerThreadMinMaxTSC[Record.TId], Record.TSC); setMinMax(PerCPUMinMaxTSC[Record.CPU], Record.TSC); @@ -137,6 +166,8 @@ return false; auto &ThreadStack = PerThreadFunctionStack[Record.TId]; + if (RecursiveCallsOnly && !ThreadStack.RecursionDepth) + ThreadStack.RecursionDepth.emplace(); switch (Record.Type) { case RecordTypes::CUSTOM_EVENT: case RecordTypes::TYPED_EVENT: @@ -144,18 +175,24 @@ return true; case RecordTypes::ENTER: case RecordTypes::ENTER_ARG: { - ThreadStack.emplace_back(Record.FuncId, Record.TSC); + ThreadStack.Stack.emplace_back(Record.FuncId, Record.TSC); + if (ThreadStack.RecursionDepth) + ++(*ThreadStack.RecursionDepth)[Record.FuncId]; break; } case RecordTypes::EXIT: case RecordTypes::TAIL_EXIT: { - if (ThreadStack.empty()) + if (ThreadStack.Stack.empty()) return false; - if (ThreadStack.back().first == Record.FuncId) { - const auto &Top = ThreadStack.back(); - recordLatency(Top.first, diff(Top.second, Record.TSC)); - ThreadStack.pop_back(); + if (ThreadStack.Stack.back().first == Record.FuncId) { + const auto &Top = ThreadStack.Stack.back(); + if (!ThreadStack.RecursionDepth || + (*ThreadStack.RecursionDepth)[Top.first].isRecursive()) + recordLatency(Top.first, diff(Top.second, Record.TSC)); + if (ThreadStack.RecursionDepth) + --(*ThreadStack.RecursionDepth)[Top.first]; + ThreadStack.Stack.pop_back(); break; } @@ -164,11 +201,11 @@ // Look for the parent up the stack. auto Parent = - std::find_if(ThreadStack.rbegin(), ThreadStack.rend(), + std::find_if(ThreadStack.Stack.rbegin(), ThreadStack.Stack.rend(), [&](const std::pair &E) { return E.first == Record.FuncId; }); - if (Parent == ThreadStack.rend()) + if (Parent == ThreadStack.Stack.rend()) return false; // Account time for this apparently sibling call exit up the stack. @@ -199,11 +236,17 @@ // complexity to do correctly (need to backtrack, etc.). // // FIXME: Potentially implement the more complex deduction algorithm? - auto I = std::next(Parent).base(); - for (auto &E : make_range(I, ThreadStack.end())) { - recordLatency(E.first, diff(E.second, Record.TSC)); + auto R = make_range(std::next(Parent).base(), ThreadStack.Stack.end()); + for (auto &E : R) { + if (!ThreadStack.RecursionDepth || + (*ThreadStack.RecursionDepth)[E.first].isRecursive()) + recordLatency(E.first, diff(E.second, Record.TSC)); + } + for (auto &Top : reverse(R)) { + if (ThreadStack.RecursionDepth) + --(*ThreadStack.RecursionDepth)[Top.first]; + ThreadStack.Stack.pop_back(); } - ThreadStack.erase(I, ThreadStack.end()); break; } } @@ -425,7 +468,8 @@ symbolize::LLVMSymbolizer Symbolizer; llvm::xray::FuncIdConversionHelper FuncIdHelper(AccountInstrMap, Symbolizer, FunctionAddresses); - xray::LatencyAccountant FCA(FuncIdHelper, AccountDeduceSiblingCalls); + xray::LatencyAccountant FCA(FuncIdHelper, AccountRecursiveCallsOnly, + AccountDeduceSiblingCalls); auto TraceOrErr = loadTraceFile(AccountInput); if (!TraceOrErr) return joinErrors( @@ -447,12 +491,12 @@ << '\n'; for (const auto &ThreadStack : FCA.getPerThreadFunctionStack()) { errs() << "Thread ID: " << ThreadStack.first << "\n"; - if (ThreadStack.second.empty()) { + if (ThreadStack.second.Stack.empty()) { errs() << " (empty stack)\n"; continue; } - auto Level = ThreadStack.second.size(); - for (const auto &Entry : llvm::reverse(ThreadStack.second)) + auto Level = ThreadStack.second.Stack.size(); + for (const auto &Entry : llvm::reverse(ThreadStack.second.Stack)) errs() << " #" << Level-- << "\t" << FuncIdHelper.SymbolOrNumber(Entry.first) << '\n'; }