Index: llvm/trunk/tools/llvm-xray/trie-node.h
===================================================================
--- llvm/trunk/tools/llvm-xray/trie-node.h
+++ llvm/trunk/tools/llvm-xray/trie-node.h
@@ -0,0 +1,92 @@
+//===- trie-node.h - XRay Call Stack Data Structure -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides a data structure and routines for working with call stacks
+// of instrumented functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_XRAY_STACK_TRIE_H
+#define LLVM_TOOLS_LLVM_XRAY_STACK_TRIE_H
+
+#include <forward_list>
+#include <numeric>
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+
+/// A type to represent a trie of invocations. It is useful to construct a
+/// graph of these nodes from reading an XRay trace, such that each function
+/// call can be placed in a larger context.
+///
+/// The template parameter allows users of the template to attach their own
+/// data elements to each node in the invocation graph.
+template <typename AssociatedData> struct TrieNode {
+  /// The ID of the function this node represents.
+  int32_t FuncId;
+
+  /// The caller of this function; nullptr when the node is a stack root.
+  TrieNode<AssociatedData> *Parent;
+
+  /// The callees from this function. The node does not own these pointers.
+  llvm::SmallVector<TrieNode<AssociatedData> *, 4> Callees;
+
+  /// Additional user-defined data on each node (the template parameter).
+  AssociatedData ExtraData;
+};
+
+/// Merges together two TrieNodes with like function ids, aggregating their
+/// callee lists and merging their ExtraData via MergeCallable. The caller must
+/// provide storage, in the form of a linked list, for new merged nodes.
+template <typename T, typename Callable>
+TrieNode<T> *
+mergeTrieNodes(const TrieNode<T> &Left, const TrieNode<T> &Right,
+               /*Non-deduced pointer type for nullptr compatibility*/
+               typename std::remove_reference<TrieNode<T> *>::type NewParent,
+               std::forward_list<TrieNode<T>> &NodeStore,
+               Callable &&MergeCallable) {
+  llvm::function_ref<T(const T &, const T &)> MergeFn(
+      std::forward<Callable>(MergeCallable));
+  assert(Left.FuncId == Right.FuncId);
+  NodeStore.push_front(TrieNode<T>{
+      Left.FuncId, NewParent, {}, MergeFn(Left.ExtraData, Right.ExtraData)});
+  auto I = NodeStore.begin();
+  auto *Node = &*I;
+
+  // Build a map of callees from the left side.
+  llvm::DenseMap<int32_t, TrieNode<T> *> LeftCalleesByFuncId;
+  for (auto *Callee : Left.Callees) {
+    LeftCalleesByFuncId[Callee->FuncId] = Callee;
+  }
+
+  // Iterate through the right side, either merging with the map values or
+  // directly adding to the Callees vector. The iteration also removes any
+  // merged values from the left side map.
+  // TODO: Unroll into iterative and explicit stack for efficiency.
+  for (auto *Callee : Right.Callees) {
+    auto iter = LeftCalleesByFuncId.find(Callee->FuncId);
+    if (iter != LeftCalleesByFuncId.end()) {
+      Node->Callees.push_back(
+          mergeTrieNodes(*(iter->second), *Callee, Node, NodeStore, MergeFn));
+      LeftCalleesByFuncId.erase(iter);
+    } else {
+      Node->Callees.push_back(Callee);
+    }
+  }
+
+  // Add left-only callees. Unmatched nodes are shared as-is, not re-parented.
+  for (auto MapPairIter : LeftCalleesByFuncId) {
+    Node->Callees.push_back(MapPairIter.second);
+  }
+
+  return Node;
+}
+
+#endif // LLVM_TOOLS_LLVM_XRAY_STACK_TRIE_H
Index: llvm/trunk/tools/llvm-xray/xray-converter.h
===================================================================
--- llvm/trunk/tools/llvm-xray/xray-converter.h
+++ llvm/trunk/tools/llvm-xray/xray-converter.h
@@ -15,8 +15,8 @@
 #define LLVM_TOOLS_LLVM_XRAY_XRAY_CONVERTER_H
 
 #include "func-id-helper.h"
-#include "llvm/XRay/XRayRecord.h"
 #include "llvm/XRay/Trace.h"
+#include "llvm/XRay/XRayRecord.h"
 
 namespace llvm {
 namespace xray {
@@ -31,6 +31,11 @@
 
   void exportAsYAML(const Trace &Records, raw_ostream &OS);
   void exportAsRAWv1(const Trace &Records, raw_ostream &OS);
+
+  /// For this conversion, the Function records within each thread are expected
+  /// to be in sorted TSC order. The trace event format encodes stack traces, so
+  /// the linear history is essential for correct output.
+  void exportAsChromeTraceEventFormat(const Trace &Records, raw_ostream &OS);
 };
 
 } // namespace xray
Index: llvm/trunk/tools/llvm-xray/xray-converter.cc
===================================================================
--- llvm/trunk/tools/llvm-xray/xray-converter.cc
+++ llvm/trunk/tools/llvm-xray/xray-converter.cc
@@ -12,10 +12,12 @@
 //===----------------------------------------------------------------------===//
 #include "xray-converter.h"
 
+#include "trie-node.h"
 #include "xray-registry.h"
 #include "llvm/DebugInfo/Symbolize/Symbolize.h"
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/YAMLTraits.h"
 #include "llvm/Support/raw_ostream.h"
@@ -32,11 +34,14 @@
 static cl::opt<std::string> ConvertInput(cl::Positional,
                                          cl::desc("<xray log file>"),
                                          cl::Required, cl::sub(Convert));
-enum class ConvertFormats { BINARY, YAML };
+enum class ConvertFormats { BINARY, YAML, CHROME_TRACE_EVENT };
 static cl::opt<ConvertFormats> ConvertOutputFormat(
     "output-format", cl::desc("output format"),
     cl::values(clEnumValN(ConvertFormats::BINARY, "raw", "output in binary"),
-               clEnumValN(ConvertFormats::YAML, "yaml", "output in yaml")),
+               clEnumValN(ConvertFormats::YAML, "yaml", "output in yaml"),
+               clEnumValN(ConvertFormats::CHROME_TRACE_EVENT, "trace_event",
+                          "Output in chrome's trace event format. "
+                          "May be visualized with the Catapult trace viewer.")),
     cl::sub(Convert));
 static cl::alias ConvertOutputFormat2("f", cl::aliasopt(ConvertOutputFormat),
                                       cl::desc("Alias for -output-format"),
@@ -142,6 +147,192 @@
   }
 }
 
+namespace {
+
+// A structure that allows building a dictionary of stack ids for the Chrome
+// trace event format.
+struct StackIdData {
+  // Each Stack of function calls has a unique ID.
+  unsigned id;
+
+  // Bookkeeping so that IDs can be maintained uniquely across threads.
+  // Traversal keeps sibling pointers to other threads' stacks. This is helpful
+  // to determine when a thread encounters a new stack and should assign a new
+  // unique ID.
+  SmallVector<TrieNode<StackIdData> *, 4> siblings;
+};
+
+using StackTrieNode = TrieNode<StackIdData>;
+
+// A helper function to find the sibling nodes for an encountered function in a
+// thread of execution. Relies on the invariant that each time a new node is
+// traversed in a thread, sibling bidirectional pointers are maintained.
+SmallVector<StackTrieNode *, 4>
+findSiblings(StackTrieNode *parent, int32_t FnId, uint32_t TId,
+             const DenseMap<uint32_t, SmallVector<StackTrieNode *, 4>>
+                 &StackRootsByThreadId) {
+  SmallVector<StackTrieNode *, 4> Siblings{};
+
+  if (parent == nullptr) {
+    // Root call: match same-FuncId roots of other threads. Bind by reference
+    // so the per-thread root vector is not copied on every iteration.
+    for (const auto &ThreadRoots : StackRootsByThreadId) {
+      if (ThreadRoots.first == TId)
+        continue;
+      for (auto *Root : ThreadRoots.second)
+        if (Root->FuncId == FnId)
+          Siblings.push_back(Root);
+    }
+    return Siblings;
+  }
+  // Non-root call: candidates are the callees of the parent's known siblings.
+  for (auto *ParentSibling : parent->ExtraData.siblings)
+    for (auto *Callee : ParentSibling->Callees)
+      if (Callee->FuncId == FnId)
+        Siblings.push_back(Callee);
+
+  return Siblings;
+}
+
+// Given a function being invoked in a thread with id TId, finds and returns the
+// StackTrieNode representing the function call stack. If no node exists,
+// creates the node. Assigns unique IDs to stacks newly encountered among all
+// threads and keeps sibling links up to date when creating new nodes.
+StackTrieNode *findOrCreateStackNode(
+    StackTrieNode *Parent, int32_t FuncId, uint32_t TId,
+    DenseMap<uint32_t, SmallVector<StackTrieNode *, 4>> &StackRootsByThreadId,
+    DenseMap<unsigned, StackTrieNode *> &StacksByStackId, unsigned *id_counter,
+    std::forward_list<StackTrieNode> &NodeStore) {
+  SmallVector<StackTrieNode *, 4> &ParentCallees =
+      Parent == nullptr ? StackRootsByThreadId[TId] : Parent->Callees;
+  auto match = find_if(ParentCallees, [FuncId](StackTrieNode *ParentCallee) {
+    return FuncId == ParentCallee->FuncId;
+  });
+  if (match != ParentCallees.end())
+    return *match;
+
+  SmallVector<StackTrieNode *, 4> siblings =
+      findSiblings(Parent, FuncId, TId, StackRootsByThreadId);
+  if (siblings.empty()) {
+    NodeStore.push_front({FuncId, Parent, {}, {(*id_counter)++, {}}});
+    StackTrieNode *CurrentStack = &NodeStore.front();
+    StacksByStackId[*id_counter - 1] = CurrentStack;
+    ParentCallees.push_back(CurrentStack);
+    return CurrentStack;
+  }
+  unsigned stack_id = siblings[0]->ExtraData.id;
+  NodeStore.push_front({FuncId, Parent, {}, {stack_id, std::move(siblings)}});
+  StackTrieNode *CurrentStack = &NodeStore.front();
+  for (auto *sibling : CurrentStack->ExtraData.siblings)
+    sibling->ExtraData.siblings.push_back(CurrentStack);
+  ParentCallees.push_back(CurrentStack);
+  return CurrentStack;
+}
+
+// Writes one indented trace event object (name, ph, tid, pid, ts, sf) to OS.
+void writeTraceViewerRecord(raw_ostream &OS, int32_t FuncId, uint32_t TId,
+                            bool Symbolize,
+                            const FuncIdConversionHelper &FuncIdHelper,
+                            double EventTimestampUs,
+                            const StackTrieNode &StackCursor,
+                            StringRef FunctionPhenotype) {
+  OS << "    " << llvm::formatv(
+      R"({ "name" : "{0}", "ph" : "{1}", "tid" : "{2}", "pid" : "1", )"
+      R"("ts" : "{3:f3}", "sf" : "{4}" })",
+      (Symbolize ? FuncIdHelper.SymbolOrNumber(FuncId)
+                 : llvm::to_string(FuncId)),
+      FunctionPhenotype, TId, EventTimestampUs, StackCursor.ExtraData.id);
+}
+
+} // namespace
+
+// Writes Records to OS as Chrome trace event JSON: a "traceEvents" array of
+// B/E events followed by a "stackFrames" dictionary keyed by stack id.
+void TraceConverter::exportAsChromeTraceEventFormat(const Trace &Records,
+                                                    raw_ostream &OS) {
+  const auto &FH = Records.getFileHeader();
+  auto CycleFreq = FH.CycleFrequency;
+
+  unsigned id_counter = 0;
+
+  OS << "{\n  \"traceEvents\": [";
+  DenseMap<uint32_t, StackTrieNode *> StackCursorByThreadId{};
+  DenseMap<uint32_t, SmallVector<StackTrieNode *, 4>> StackRootsByThreadId{};
+  DenseMap<unsigned, StackTrieNode *> StacksByStackId{};
+  std::forward_list<StackTrieNode> NodeStore{};
+  int loop_count = 0;
+  // Separator preceding each event. It is written per emitted event, not per
+  // record, since a record may emit zero events or several (stack unwinding).
+  auto WriteSeparator = [&] { OS << (loop_count++ == 0 ? "\n" : ",\n"); };
+  for (const auto &R : Records) {
+    // Chrome trace event format always wants data in micros.
+    // CyclesPerMicro = CycleHertz / 10^6
+    // TSC / CyclesPerMicro == TSC * 10^6 / CycleHertz == MicroTimestamp
+    // Could lose some precision here by converting the TSC to a double to
+    // multiply by the period in micros. 52 bit mantissa is a good start though.
+    // TODO: Make feature request to Chrome Trace viewer to accept ticks and a
+    // frequency or do some more involved calculation to avoid dangers of
+    // conversion.
+    double EventTimestampUs = double(1000000) / CycleFreq * double(R.TSC);
+    StackTrieNode *&StackCursor = StackCursorByThreadId[R.TId];
+    switch (R.Type) {
+    case RecordTypes::ENTER:
+    case RecordTypes::ENTER_ARG:
+      StackCursor = findOrCreateStackNode(StackCursor, R.FuncId, R.TId,
+                                          StackRootsByThreadId, StacksByStackId,
+                                          &id_counter, NodeStore);
+      // Each record is represented as a json dictionary with function name,
+      // type of B for begin or E for end, thread id, process id (faked),
+      // timestamp in microseconds, and a stack frame id. The ids are logged
+      // in an id dictionary after the events.
+      WriteSeparator();
+      writeTraceViewerRecord(OS, R.FuncId, R.TId, Symbolize, FuncIdHelper,
+                             EventTimestampUs, *StackCursor, "B");
+      break;
+    case RecordTypes::EXIT:
+    case RecordTypes::TAIL_EXIT:
+      // No entries to record end for.
+      if (StackCursor == nullptr)
+        break;
+      // Should we emit an END record anyway or account for this condition?
+      // (And/Or in loop termination below)
+      StackTrieNode *PreviousCursor = nullptr;
+      do {
+        WriteSeparator();
+        writeTraceViewerRecord(OS, StackCursor->FuncId, R.TId, Symbolize,
+                               FuncIdHelper, EventTimestampUs, *StackCursor,
+                               "E");
+        PreviousCursor = StackCursor;
+        StackCursor = StackCursor->Parent;
+      } while (PreviousCursor->FuncId != R.FuncId && StackCursor != nullptr);
+      break;
+    }
+  }
+  OS << "\n  ],\n"; // Close the Trace Events array.
+  OS << "  \"displayTimeUnit\": \"ns\",\n";
+
+  // The stackFrames dictionary substantially reduces size of the output file by
+  // avoiding repeating the entire call stack of function names for each entry.
+  OS << R"(  "stackFrames": {)";
+  int stack_frame_count = 0;
+  for (const auto &map_iter : StacksByStackId) {
+    if (stack_frame_count++ == 0)
+      OS << "\n";
+    else
+      OS << ",\n";
+    OS << "    " << llvm::formatv(
+        R"("{0}" : { "name" : "{1}")", map_iter.first,
+        (Symbolize ? FuncIdHelper.SymbolOrNumber(map_iter.second->FuncId)
+                   : llvm::to_string(map_iter.second->FuncId)));
+    if (map_iter.second->Parent != nullptr)
+      OS << llvm::formatv(R"(, "parent": "{0}")",
+                          map_iter.second->Parent->ExtraData.id);
+    OS << " }";
+  }
+  OS << "\n  }\n"; // Close the stack frames map.
+  OS << "}\n";     // Close the JSON entry.
+}
+
 namespace llvm {
 namespace xray {
 
@@ -191,6 +382,9 @@
   case ConvertFormats::BINARY:
     TC.exportAsRAWv1(T, OS);
     break;
+  case ConvertFormats::CHROME_TRACE_EVENT:
+    TC.exportAsChromeTraceEventFormat(T, OS);
+    break;
   }
   return Error::success();
 });
Index: llvm/trunk/tools/llvm-xray/xray-stacks.cc
===================================================================
--- llvm/trunk/tools/llvm-xray/xray-stacks.cc
+++ llvm/trunk/tools/llvm-xray/xray-stacks.cc
@@ -19,6 +19,7 @@
 #include <numeric>
 
 #include "func-id-helper.h"
+#include "trie-node.h"
 #include "xray-registry.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/CommandLine.h"
@@ -255,96 +256,61 @@
 /// maintain an index of unique functions, and provide a means of iterating
 /// through all the instrumented call stacks which we know about.
 
-struct TrieNode {
-  int32_t FuncId;
-  TrieNode *Parent;
-  SmallVector<TrieNode *, 4> Callees;
-  // Separate durations depending on whether the node is the deepest node in the
-  // stack.
-  SmallVector<int64_t, 4> TerminalDurations;
-  SmallVector<int64_t, 4> IntermediateDurations;
+struct StackDuration {
+  llvm::SmallVector<int64_t, 4> TerminalDurations;
+  llvm::SmallVector<int64_t, 4> IntermediateDurations;
 };
 
-/// Merges together two TrieNodes with like function ids, aggregating their
-/// callee lists and durations. The caller must provide storage where new merged
-/// nodes can be allocated in the form of a linked list.
-TrieNode *mergeTrieNodes(const TrieNode &Left, const TrieNode &Right,
-                         TrieNode *NewParent,
-                         std::forward_list<TrieNode> &NodeStore) {
-  assert(Left.FuncId == Right.FuncId);
-  NodeStore.push_front(TrieNode{Left.FuncId, NewParent, {}, {}, {}});
-  auto I = NodeStore.begin();
-  auto *Node = &*I;
-
-  // Build a map of callees from the left side.
-  DenseMap<int32_t, TrieNode *> LeftCalleesByFuncId;
-  for (auto *Callee : Left.Callees) {
-    LeftCalleesByFuncId[Callee->FuncId] = Callee;
-  }
-
-  // Iterate through the right side, either merging with the map values or
-  // directly adding to the Callees vector. The iteration also removes any
-  // merged values from the left side map.
-  for (auto *Callee : Right.Callees) {
-    auto iter = LeftCalleesByFuncId.find(Callee->FuncId);
-    if (iter != LeftCalleesByFuncId.end()) {
-      Node->Callees.push_back(
-          mergeTrieNodes(*(iter->second), *Callee, Node, NodeStore));
-      LeftCalleesByFuncId.erase(iter);
-    } else {
-      Node->Callees.push_back(Callee);
-    }
-  }
-
-  // Add any callees that weren't found in the right side.
-  for (auto MapPairIter : LeftCalleesByFuncId) {
-    Node->Callees.push_back(MapPairIter.second);
-  }
-
+StackDuration mergeStackDuration(const StackDuration &Left,
+                                 const StackDuration &Right) {
+  StackDuration Data{};
+  Data.TerminalDurations.reserve(Left.TerminalDurations.size() +
+                                 Right.TerminalDurations.size());
+  Data.IntermediateDurations.reserve(Left.IntermediateDurations.size() +
+                                     Right.IntermediateDurations.size());
   // Aggregate the durations.
-  for (auto duration : Left.TerminalDurations) {
-    Node->TerminalDurations.push_back(duration);
-  }
-  for (auto duration : Right.TerminalDurations) {
-    Node->TerminalDurations.push_back(duration);
-  }
-  for (auto duration : Left.IntermediateDurations) {
-    Node->IntermediateDurations.push_back(duration);
-  }
-  for (auto duration : Right.IntermediateDurations) {
-    Node->IntermediateDurations.push_back(duration);
-  }
-
-  return Node;
+  for (auto duration : Left.TerminalDurations)
+    Data.TerminalDurations.push_back(duration);
+  for (auto duration : Right.TerminalDurations)
+    Data.TerminalDurations.push_back(duration);
+
+  for (auto duration : Left.IntermediateDurations)
+    Data.IntermediateDurations.push_back(duration);
+  for (auto duration : Right.IntermediateDurations)
+    Data.IntermediateDurations.push_back(duration);
+  return Data;
 }
 
+using StackTrieNode = TrieNode<StackDuration>;
+
 template <AggregationType AggType>
-std::size_t GetValueForStack(const TrieNode *Node);
+std::size_t GetValueForStack(const StackTrieNode *Node);
 
 // When computing total time spent in a stack, we're adding the timings from
 // its callees and the timings from when it was a leaf.
 template <>
 std::size_t
-GetValueForStack<AggregationType::TOTAL_TIME>(const TrieNode *Node) {
-  auto TopSum = std::accumulate(Node->TerminalDurations.begin(),
-                                Node->TerminalDurations.end(), 0uLL);
-  return std::accumulate(Node->IntermediateDurations.begin(),
-                         Node->IntermediateDurations.end(), TopSum);
+GetValueForStack<AggregationType::TOTAL_TIME>(const StackTrieNode *Node) {
+  auto TopSum = std::accumulate(Node->ExtraData.TerminalDurations.begin(),
+                                Node->ExtraData.TerminalDurations.end(), 0uLL);
+  return std::accumulate(Node->ExtraData.IntermediateDurations.begin(),
+                         Node->ExtraData.IntermediateDurations.end(), TopSum);
 }
 
 // Calculates how many times a function was invoked.
 // TODO: Hook up option to produce stacks
 template <>
 std::size_t
-GetValueForStack<AggregationType::INVOCATION_COUNT>(const TrieNode *Node) {
-  return Node->TerminalDurations.size() + Node->IntermediateDurations.size();
+GetValueForStack<AggregationType::INVOCATION_COUNT>(const StackTrieNode *Node) {
+  return Node->ExtraData.TerminalDurations.size() +
+         Node->ExtraData.IntermediateDurations.size();
 }
 
 // Make sure there are implementations for each enum value.
 template <AggregationType T> struct DependentFalseType : std::false_type {};
 
 template <AggregationType AggType>
-std::size_t GetValueForStack(const TrieNode *Node) {
+std::size_t GetValueForStack(const StackTrieNode *Node) {
   static_assert(DependentFalseType<AggType>::value,
                 "No implementation found for aggregation type provided.");
   return 0;
@@ -353,21 +319,21 @@
 class StackTrie {
   // Avoid the magic number of 4 propagated through the code with an alias.
   // We use this SmallVector to track the root nodes in a call graph.
-  using RootVector = SmallVector<TrieNode *, 4>;
+  using RootVector = SmallVector<StackTrieNode *, 4>;
 
   // We maintain pointers to the roots of the tries we see.
   DenseMap<uint32_t, RootVector> Roots;
 
   // We make sure all the nodes are accounted for in this list.
-  std::forward_list<TrieNode> NodeStore;
+  std::forward_list<StackTrieNode> NodeStore;
 
   // A map of thread ids to pairs call stack trie nodes and their start times.
-  DenseMap<uint32_t, SmallVector<std::pair<TrieNode *, uint64_t>, 8>>
+  DenseMap<uint32_t, SmallVector<std::pair<StackTrieNode *, uint64_t>, 8>>
       ThreadStackMap;
 
-  TrieNode *createTrieNode(uint32_t ThreadId, int32_t FuncId,
-                           TrieNode *Parent) {
-    NodeStore.push_front(TrieNode{FuncId, Parent, {}, {}, {}});
+  StackTrieNode *createTrieNode(uint32_t ThreadId, int32_t FuncId,
+                                StackTrieNode *Parent) {
+    NodeStore.push_front(StackTrieNode{FuncId, Parent, {}, {{}, {}}});
     auto I = NodeStore.begin();
     auto *Node = &*I;
     if (!Parent)
@@ -375,10 +341,10 @@
     return Node;
   }
 
-  TrieNode *findRootNode(uint32_t ThreadId, int32_t FuncId) {
+  StackTrieNode *findRootNode(uint32_t ThreadId, int32_t FuncId) {
     const auto &RootsByThread = Roots[ThreadId];
     auto I = find_if(RootsByThread,
-                     [&](TrieNode *N) { return N->FuncId == FuncId; });
+                     [&](StackTrieNode *N) { return N->FuncId == FuncId; });
     return (I == RootsByThread.end()) ? nullptr : *I;
   }
 
@@ -416,7 +382,7 @@
 
       auto &Top = TS.back();
       auto I = find_if(Top.first->Callees,
-                       [&](TrieNode *N) { return N->FuncId == R.FuncId; });
+                       [&](StackTrieNode *N) { return N->FuncId == R.FuncId; });
       if (I == Top.first->Callees.end()) {
         // We didn't find the callee in the stack trie, so we're going to
         // add to the stack then set up the pointers properly.
@@ -447,8 +413,8 @@
         return AccountRecordStatus::ENTRY_NOT_FOUND;
       }
 
-      auto FunctionEntryMatch =
-          find_if(reverse(TS), [&](const std::pair<TrieNode *, uint64_t> &E) {
+      auto FunctionEntryMatch = find_if(
+          reverse(TS), [&](const std::pair<StackTrieNode *, uint64_t> &E) {
             return E.first->FuncId == R.FuncId;
           });
       auto status = AccountRecordStatus::OK;
@@ -461,14 +427,14 @@
       }
       auto I = FunctionEntryMatch.base();
       for (auto &E : make_range(I, TS.end() - 1))
-        E.first->IntermediateDurations.push_back(std::max(E.second, R.TSC) -
-                                                 std::min(E.second, R.TSC));
+        E.first->ExtraData.IntermediateDurations.push_back(
+            std::max(E.second, R.TSC) - std::min(E.second, R.TSC));
       auto &Deepest = TS.back();
       if (wasLastRecordExit)
-        Deepest.first->IntermediateDurations.push_back(
+        Deepest.first->ExtraData.IntermediateDurations.push_back(
             std::max(Deepest.second, R.TSC) - std::min(Deepest.second, R.TSC));
       else
-        Deepest.first->TerminalDurations.push_back(
+        Deepest.first->ExtraData.TerminalDurations.push_back(
             std::max(Deepest.second, R.TSC) - std::min(Deepest.second, R.TSC));
       TS.erase(I, TS.end());
       return status;
@@ -479,11 +445,11 @@
 
   bool isEmpty() const { return Roots.empty(); }
 
-  void printStack(raw_ostream &OS, const TrieNode *Top,
+  void printStack(raw_ostream &OS, const StackTrieNode *Top,
                   FuncIdConversionHelper &FN) {
     // Traverse the pointers up to the parent, noting the sums, then print
     // in reverse order (callers at top, callees down bottom).
-    SmallVector<const TrieNode *, 8> CurrentStack;
+    SmallVector<const StackTrieNode *, 8> CurrentStack;
     for (auto *F = Top; F != nullptr; F = F->Parent)
       CurrentStack.push_back(F);
     int Level = 0;
@@ -491,21 +457,22 @@
                   "count", "sum");
     for (auto *F :
          reverse(make_range(CurrentStack.begin() + 1, CurrentStack.end()))) {
-      auto Sum = std::accumulate(F->IntermediateDurations.begin(),
-                                 F->IntermediateDurations.end(), 0LL);
+      auto Sum = std::accumulate(F->ExtraData.IntermediateDurations.begin(),
+                                 F->ExtraData.IntermediateDurations.end(), 0LL);
       auto FuncId = FN.SymbolOrNumber(F->FuncId);
       OS << formatv("#{0,-4} {1,-60} {2,+12} {3,+16}\n", Level++,
                     FuncId.size() > 60 ? FuncId.substr(0, 57) + "..." : FuncId,
-                    F->IntermediateDurations.size(), Sum);
+                    F->ExtraData.IntermediateDurations.size(), Sum);
     }
     auto *Leaf = *CurrentStack.begin();
-    auto LeafSum = std::accumulate(Leaf->TerminalDurations.begin(),
-                                   Leaf->TerminalDurations.end(), 0LL);
+    auto LeafSum =
+        std::accumulate(Leaf->ExtraData.TerminalDurations.begin(),
+                        Leaf->ExtraData.TerminalDurations.end(), 0LL);
     auto LeafFuncId = FN.SymbolOrNumber(Leaf->FuncId);
     OS << formatv("#{0,-4} {1,-60} {2,+12} {3,+16}\n", Level++,
                   LeafFuncId.size() > 60 ? LeafFuncId.substr(0, 57) + "..."
                                          : LeafFuncId,
-                  Leaf->TerminalDurations.size(), LeafSum);
+                  Leaf->ExtraData.TerminalDurations.size(), LeafSum);
     OS << "\n";
   }
 
@@ -552,20 +519,20 @@
 
   /// Creates a merged list of Tries for unique stacks that disregards their
   /// thread IDs.
-  RootVector mergeAcrossThreads(std::forward_list<TrieNode> &NodeStore) {
+  RootVector mergeAcrossThreads(std::forward_list<StackTrieNode> &NodeStore) {
     RootVector MergedByThreadRoots;
     for (auto MapIter : Roots) {
       const auto &RootNodeVector = MapIter.second;
       for (auto *Node : RootNodeVector) {
         auto MaybeFoundIter =
-            find_if(MergedByThreadRoots, [Node](TrieNode *elem) {
+            find_if(MergedByThreadRoots, [Node](StackTrieNode *elem) {
               return Node->FuncId == elem->FuncId;
             });
         if (MaybeFoundIter == MergedByThreadRoots.end()) {
           MergedByThreadRoots.push_back(Node);
         } else {
-          MergedByThreadRoots.push_back(
-              mergeTrieNodes(**MaybeFoundIter, *Node, nullptr, NodeStore));
+          MergedByThreadRoots.push_back(mergeTrieNodes(
+              **MaybeFoundIter, *Node, nullptr, NodeStore, mergeStackDuration));
           MergedByThreadRoots.erase(MaybeFoundIter);
         }
       }
@@ -577,7 +544,7 @@
   template <AggregationType AggType>
   void printAllAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN,
                                   StackOutputFormat format) {
-    std::forward_list<TrieNode> AggregatedNodeStore;
+    std::forward_list<StackTrieNode> AggregatedNodeStore;
     RootVector MergedByThreadRoots = mergeAcrossThreads(AggregatedNodeStore);
     bool reportThreadId = false;
     printAll<AggType>(OS, FN, MergedByThreadRoots,
@@ -586,7 +553,7 @@
 
   /// Merges the trie by thread id before printing top stacks.
   void printAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN) {
-    std::forward_list<TrieNode> AggregatedNodeStore;
+    std::forward_list<StackTrieNode> AggregatedNodeStore;
     RootVector MergedByThreadRoots = mergeAcrossThreads(AggregatedNodeStore);
     print(OS, FN, MergedByThreadRoots);
   }
@@ -595,7 +562,7 @@
   template <AggregationType AggType>
   void printAll(raw_ostream &OS, FuncIdConversionHelper &FN,
                 RootVector RootValues, uint32_t ThreadId, bool ReportThread) {
-    SmallVector<const TrieNode *, 16> S;
+    SmallVector<const StackTrieNode *, 16> S;
     for (const auto *N : RootValues) {
       S.clear();
       S.push_back(N);
@@ -616,10 +583,10 @@
   template <AggregationType AggType>
   void printSingleStack(raw_ostream &OS, FuncIdConversionHelper &Converter,
                         bool ReportThread, uint32_t ThreadId,
-                        const TrieNode *Node) {
+                        const StackTrieNode *Node) {
     if (ReportThread)
       OS << "thread_" << ThreadId << ";";
-    SmallVector<const TrieNode *, 5> lineage{};
+    SmallVector<const StackTrieNode *, 5> lineage{};
     lineage.push_back(Node);
     while (lineage.back()->Parent != nullptr)
       lineage.push_back(lineage.back()->Parent);
@@ -639,15 +606,17 @@
     //   - Total number of unique stacks
     //   - Top 10 stacks by count
     //   - Top 10 stacks by aggregate duration
-    SmallVector<std::pair<const TrieNode *, uint64_t>, 11> TopStacksByCount;
-    SmallVector<std::pair<const TrieNode *, uint64_t>, 11> TopStacksBySum;
-    auto greater_second = [](const std::pair<const TrieNode *, uint64_t> &A,
-                             const std::pair<const TrieNode *, uint64_t> &B) {
-      return A.second > B.second;
-    };
+    SmallVector<std::pair<const StackTrieNode *, uint64_t>, 11>
+        TopStacksByCount;
+    SmallVector<std::pair<const StackTrieNode *, uint64_t>, 11> TopStacksBySum;
+    auto greater_second =
+        [](const std::pair<const StackTrieNode *, uint64_t> &A,
+           const std::pair<const StackTrieNode *, uint64_t> &B) {
+          return A.second > B.second;
+        };
     uint64_t UniqueStacks = 0;
     for (const auto *N : RootValues) {
-      SmallVector<const TrieNode *, 16> S;
+      SmallVector<const StackTrieNode *, 16> S;
       S.emplace_back(N);
 
       while (!S.empty()) {
@@ -655,10 +624,11 @@
 
         // We only start printing the stack (by walking up the parent pointers)
         // when we get to a leaf function.
-        if (!Top->TerminalDurations.empty()) {
+        if (!Top->ExtraData.TerminalDurations.empty()) {
           ++UniqueStacks;
-          auto TopSum = std::accumulate(Top->TerminalDurations.begin(),
-                                        Top->TerminalDurations.end(), 0uLL);
+          auto TopSum =
+              std::accumulate(Top->ExtraData.TerminalDurations.begin(),
+                              Top->ExtraData.TerminalDurations.end(), 0uLL);
           {
             auto E = std::make_pair(Top, TopSum);
             TopStacksBySum.insert(std::lower_bound(TopStacksBySum.begin(),
@@ -669,7 +639,8 @@
               TopStacksBySum.pop_back();
           }
           {
-            auto E = std::make_pair(Top, Top->TerminalDurations.size());
+            auto E =
+                std::make_pair(Top, Top->ExtraData.TerminalDurations.size());
             TopStacksByCount.insert(std::lower_bound(TopStacksByCount.begin(),
                                                      TopStacksByCount.end(), E,
                                                      greater_second),