Index: tools/llvm-xray/xray-stacks.cc =================================================================== --- tools/llvm-xray/xray-stacks.cc +++ tools/llvm-xray/xray-stacks.cc @@ -66,8 +66,52 @@ cl::desc("Aggregate stack times across threads"), cl::sub(Stack), cl::init(false)); -/// A helper struct to work with formatv and XRayRecords. Makes it easier to use -/// instrumentation map names or addresses in formatted output. +static cl::opt + DumpAllStacks("all-stacks", + cl::desc("Dump sum of timings for all stacks. " + "By default separates stacks per-thread."), + cl::sub(Stack), cl::init(false)); +static cl::alias DumpAllStacksShort("all", cl::aliasopt(DumpAllStacks), + cl::desc("Alias for -all-stacks"), + cl::sub(Stack)); + +// TODO(kpw): Add other interesting formats. Perhaps chrome trace viewer format +// possibly with aggregations or just a linear trace of timings. +enum StackOutputFormat { HUMAN, FLAMETOOL }; + +static cl::opt StacksOutputFormat( + "stack-format", + cl::desc("The format that output stacks should be " + "output in. Only applies with all-stacks."), + cl::values( + clEnumValN(HUMAN, "human", + "Human readable output. Only valid without -all-stacks."), + clEnumValN(FLAMETOOL, "flame", + "Format consumable by Brendan Gregg's FlameGraph tool. " + "Only valid with -all-stacks.")), + cl::sub(Stack), cl::init(HUMAN)); + +// Types of values for each stack in a CallTrie. +enum class AggregationType { + TOTAL_TIME, // The total time spent in a stack and its callees. + INVOCATION_COUNT // The number of times the stack was invoked. +}; + +static cl::opt RequestedAggregation( + "aggregation-type", + cl::desc("The type of aggregation to do on call stacks."), + cl::values( + clEnumValN( + AggregationType::TOTAL_TIME, "time", + "Capture the total time spent in an all invocations of a stack."), + clEnumValN(AggregationType::INVOCATION_COUNT, "count", + "Capture the number of times a stack was invoked. " + "In flamegraph mode, this count also includes invocations " + "of all callees.")), + cl::sub(Stack), cl::init(AggregationType::TOTAL_TIME)); + +/// A helper struct to work with formatv and XRayRecords. Makes it easier to +/// use instrumentation map names or addresses in formatted output. struct format_xray_record : public FormatAdapter { explicit format_xray_record(XRayRecord record, const FuncIdConversionHelper &conv) @@ -274,10 +318,45 @@ return Node; } +template +std::size_t GetValueForStack(const TrieNode *Node); + +// When computing total time spent in a stack, we're adding the timings from +// its callees and the timings from when it was a leaf. +template <> +std::size_t +GetValueForStack(const TrieNode *Node) { + auto TopSum = std::accumulate(Node->TerminalDurations.begin(), + Node->TerminalDurations.end(), 0uLL); + return std::accumulate(Node->IntermediateDurations.begin(), + Node->IntermediateDurations.end(), TopSum); +} + +// Calculates how many times a function was invoked. +// TODO(kpw): Hook up option to produce stacks +template <> +std::size_t +GetValueForStack(const TrieNode *Node) { + return Node->TerminalDurations.size() + Node->IntermediateDurations.size(); +} + +// Make sure there are implementations for each enum value. +template struct DependentFalseType : std::false_type {}; + +template +std::size_t GetValueForStack(const TrieNode *Node) { + static_assert(DependentFalseType::value, + "No implementation found for aggregation type provided."); + return 0; +} + class StackTrie { + // Avoid the magic number of 4 propagated through the code with an alias. + // We use this SmallVector to track the root nodes in a call graph. + using RootVector = SmallVector; // We maintain pointers to the roots of the tries we see. - DenseMap> Roots; + DenseMap Roots; // We make sure all the nodes are accounted for in this list. std::forward_list NodeStore; @@ -439,11 +518,23 @@ } } + /// Prints timing sums for each stack in each threads. + template + void printAllPerThread(raw_ostream &OS, FuncIdConversionHelper &FN, + StackOutputFormat format) { + for (auto iter : Roots) { + uint32_t threadId = iter.first; + RootVector &perThreadRoots = iter.second; + bool reportThreadId = true; + printAll(OS, FN, perThreadRoots, threadId, reportThreadId); + } + } + /// Prints top stacks from looking at all the leaves and ignoring thread IDs. /// Stacks that consist of the same function IDs but were called in different /// thread IDs are not considered unique in this printout. void printIgnoringThreads(raw_ostream &OS, FuncIdConversionHelper &FN) { - SmallVector RootValues; + RootVector RootValues; // Function to pull the values out of a map iterator. using RootsType = decltype(Roots.begin())::value_type; @@ -459,30 +550,87 @@ print(OS, FN, RootValues); } - /// Merges the trie by thread id before printing top stacks. - void printAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN) { - std::forward_list AggregatedNodeStore; - SmallVector RootValues; + /// Creates a merged list of Tries for unique stacks that disregards their + /// thread IDs. + RootVector mergeAcrossThreads(std::forward_list &NodeStore) { + RootVector MergedByThreadRoots; for (auto MapIter : Roots) { const auto &RootNodeVector = MapIter.second; for (auto *Node : RootNodeVector) { - auto MaybeFoundIter = find_if(RootValues, [Node](TrieNode *elem) { - return Node->FuncId == elem->FuncId; - }); - if (MaybeFoundIter == RootValues.end()) { - RootValues.push_back(Node); + auto MaybeFoundIter = + find_if(MergedByThreadRoots, [Node](TrieNode *elem) { + return Node->FuncId == elem->FuncId; + }); + if (MaybeFoundIter == MergedByThreadRoots.end()) { + MergedByThreadRoots.push_back(Node); } else { - RootValues.push_back(mergeTrieNodes(**MaybeFoundIter, *Node, nullptr, - AggregatedNodeStore)); - RootValues.erase(MaybeFoundIter); + MergedByThreadRoots.push_back( + mergeTrieNodes(**MaybeFoundIter, *Node, nullptr, NodeStore)); + MergedByThreadRoots.erase(MaybeFoundIter); } } } - print(OS, FN, RootValues); + return MergedByThreadRoots; + } + + /// Print timing sums for all stacks merged by Thread ID. + template + void printAllAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN, + StackOutputFormat format) { + std::forward_list AggregatedNodeStore; + RootVector MergedByThreadRoots = mergeAcrossThreads(AggregatedNodeStore); + bool reportThreadId = false; + printAll(OS, FN, MergedByThreadRoots, + /*threadId*/ 0, reportThreadId); + } + + /// Merges the trie by thread id before printing top stacks. + void printAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN) { + std::forward_list AggregatedNodeStore; + RootVector MergedByThreadRoots = mergeAcrossThreads(AggregatedNodeStore); + print(OS, FN, MergedByThreadRoots); + } + + // TODO(kpw): Add a format option when more than one are supported. + template + void printAll(raw_ostream &OS, FuncIdConversionHelper &FN, + RootVector RootValues, uint32_t ThreadId, bool ReportThread) { + for (const auto *N : RootValues) { + SmallVector S; + S.emplace_back(N); + while (!S.empty()) { + auto *Top = S.pop_back_val(); + printSingleStack(OS, FN, ReportThread, ThreadId, Top); + for (const auto *C : Top->Callees) + S.push_back(C); + } + } + } + + /// Prints values for stacks in a format consumable for the flamegraph.pl + /// tool. This is a line based format that lists each level in the stack + /// hierarchy in a semicolon delimited form followed by a space and a numeric + /// value. If breaking down by thread, the thread ID will be added as the + /// root level of the stack. + template + void printSingleStack(raw_ostream &OS, FuncIdConversionHelper &Converter, + bool ReportThread, uint32_t ThreadId, + const TrieNode *Node) { + if (ReportThread) + OS << "thread_" << ThreadId << ";"; + std::stack lineage{}; + lineage.push(Node); + while (lineage.top()->Parent != nullptr) + lineage.push(lineage.top()->Parent); + while (!lineage.empty()) { + OS << Converter.SymbolOrNumber(lineage.top()->FuncId) << ";"; + lineage.pop(); + } + OS << " " << GetValueForStack(Node) << "\n"; } void print(raw_ostream &OS, FuncIdConversionHelper &FN, - SmallVector RootValues) { + RootVector RootValues) { // Go through each of the roots, and traverse the call stack, producing the // aggregates as you go along. Remember these aggregates and stacks, and // show summary statistics about: @@ -502,7 +650,7 @@ S.emplace_back(N); while (!S.empty()) { - auto Top = S.pop_back_val(); + auto *Top = S.pop_back_val(); // We only start printing the stack (by walking up the parent pointers) // when we get to a leaf function. @@ -587,6 +735,18 @@ "that aggregates threads."), std::make_error_code(std::errc::invalid_argument)); + if (!DumpAllStacks && StacksOutputFormat != HUMAN) { + return make_error( + Twine("Can't specify a non-human format without -all-stacks."), + std::make_error_code(std::errc::invalid_argument)); + } + if (DumpAllStacks && StacksOutputFormat == HUMAN) { + return make_error( + Twine("You must specify a non-human format when reporting with " + "-all-stacks."), + std::make_error_code(std::errc::invalid_argument)); + } + symbolize::LLVMSymbolizer::Options Opts( symbolize::FunctionNameKind::LinkageName, true, true, false, ""); symbolize::LLVMSymbolizer Symbolizer(Opts); @@ -625,6 +785,44 @@ "No instrumented calls were accounted in the input file.", make_error_code(errc::result_out_of_range)); } + + // Report the stacks in a long form mode for another tool to analyze. + if (DumpAllStacks) { + if (AggregateThreads) { + switch (RequestedAggregation) { + case AggregationType::TOTAL_TIME: + ST.printAllAggregatingThreads( + outs(), FuncIdHelper, StacksOutputFormat); + break; + case AggregationType::INVOCATION_COUNT: + ST.printAllAggregatingThreads( + outs(), FuncIdHelper, StacksOutputFormat); + break; + default: + return make_error( + "Illegal value for aggregation-type.", + make_error_code(errc::result_out_of_range)); + } + } else { + switch (RequestedAggregation) { + case AggregationType::TOTAL_TIME: + ST.printAllPerThread(outs(), FuncIdHelper, + StacksOutputFormat); + break; + case AggregationType::INVOCATION_COUNT: + ST.printAllPerThread( + outs(), FuncIdHelper, StacksOutputFormat); + break; + default: + return make_error( + "Illegal value for aggregation-type.", + make_error_code(errc::result_out_of_range)); + } + } + return Error::success(); + } + + // We're only outputting top stacks. if (AggregateThreads) { ST.printAggregatingThreads(outs(), FuncIdHelper); } else if (SeparateThreadStacks) {