Index: compiler-rt/lib/xray/tests/CMakeLists.txt =================================================================== --- compiler-rt/lib/xray/tests/CMakeLists.txt +++ compiler-rt/lib/xray/tests/CMakeLists.txt @@ -68,7 +68,11 @@ endfunction() set(XRAY_TEST_ARCH ${XRAY_SUPPORTED_ARCH}) -set(XRAY_UNITTEST_LINK_FLAGS ${CMAKE_THREAD_LIBS_INIT}) +set(XRAY_UNITTEST_LINK_FLAGS + ${CMAKE_THREAD_LIBS_INIT} + -fxray-instrument + -lstdc++ # TODO: Detect the correct standard library used! + ) if (NOT APPLE) append_list_if(COMPILER_RT_HAS_LIBM -lm XRAY_UNITTEST_LINK_FLAGS) append_list_if(COMPILER_RT_HAS_LIBRT -lrt XRAY_UNITTEST_LINK_FLAGS) @@ -94,7 +98,7 @@ RUNTIME "${XRAY_RUNTIME_LIBS}" DEPS gtest xray CFLAGS ${XRAY_UNITTEST_CFLAGS} - LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS} -lstdc++) + LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS}) set_target_properties(XRayUnitTests PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endforeach() Index: compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc =================================================================== --- compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc +++ compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc @@ -18,12 +18,6 @@ namespace { -TEST(FunctionCallTrieTest, Construction) { - // We want to make sure that we can create one of these without the set of - // allocators we need. This will by default use the global allocators. - FunctionCallTrie Trie; -} - TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) { // FIXME: Support passing in configuration for allocators in the allocator // constructors. @@ -61,6 +55,17 @@ ASSERT_TRUE(R.empty()); } +TEST(FunctionCallTrieTest, NoMatchingEntersForExit) { + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + Trie.enterFunction(2, 1); + Trie.enterFunction(3, 3); + Trie.exitFunction(1, 5); + const auto &R = Trie.getRoots(); + + ASSERT_TRUE(R.empty()); +} + TEST(FunctionCallTrieTest, MissingFunctionExit) { auto A = FunctionCallTrie::InitAllocators(); FunctionCallTrie Trie(A); @@ -153,6 +158,30 @@ EXPECT_EQ(F1.CumulativeLocalTime, 100); } +TEST(FunctionCallTrieTest, DeepCallStack) { + // Simulate a relatively deep call stack (32 levels) and ensure that we can + // properly pop all the way up the stack. + profilingFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + for (int i = 0; i < 32; ++i) + Trie.enterFunction(i + 1, i); + Trie.exitFunction(1, 33); + + // Here, validate that we have a 32-level deep function call path from the + // root (1) down to the leaf (33). + const auto &R = Trie.getRoots(); + ASSERT_EQ(R.size(), 1u); + auto F = R[0]; + for (int i = 0; i < 32; ++i) { + EXPECT_EQ(F->FId, i + 1); + if (F->Callees.empty() && i != 31) + FAIL() << "Empty callees for FId " << F->FId; + if (i != 31) + F = F->Callees[0].NodePtr; + } +} + // TODO: Test that we can handle cross-CPU migrations, where TSCs are not // guaranteed to be synchronised. TEST(FunctionCallTrieTest, DeepCopy) { Index: compiler-rt/lib/xray/tests/unit/segmented_array_test.cc =================================================================== --- compiler-rt/lib/xray/tests/unit/segmented_array_test.cc +++ compiler-rt/lib/xray/tests/unit/segmented_array_test.cc @@ -107,32 +107,84 @@ } TEST(SegmentedArrayTest, IteratorTrimBehaviour) { - Array data; - ASSERT_TRUE(data.empty()); - auto I0Begin = data.begin(), I0End = data.end(); - // Add enough elements in data to have more than one chunk. 
- constexpr auto ChunkX2 = Array::ChunkSize * 2; + using AllocatorType = typename Array::AllocatorType; + AllocatorType A(1 << 10, 0); + Array Data(A); + ASSERT_TRUE(Data.empty()); + auto I0Begin = Data.begin(), I0End = Data.end(); + // Add enough elements in Data to have more than one chunk. + constexpr auto Chunk = Array::ChunkSize; + constexpr auto ChunkX2 = Chunk * 2; for (auto i = ChunkX2; i > 0u; --i) { - data.Append({static_cast(i), static_cast(i)}); + Data.AppendEmplace(static_cast(i), static_cast(i)); } - ASSERT_EQ(data.size(), ChunkX2); + ASSERT_EQ(Data.size(), ChunkX2); + { + auto &Back = Data.back(); + ASSERT_EQ(Back.First, 1); + ASSERT_EQ(Back.Second, 1); + } + // Trim one chunk's elements worth. - data.trim(ChunkX2 / 2); - ASSERT_EQ(data.size(), ChunkX2 / 2); + Data.trim(ChunkX2 / 2); + ASSERT_EQ(Data.size(), ChunkX2 / 2); + + // Check that we are still able to access 'back' properly. + { + auto &Back = Data.back(); + ASSERT_EQ(Back.First, static_cast(Chunk + 1)); + ASSERT_EQ(Back.Second, static_cast(Chunk + 1)); + } + // Then trim until it's empty. - data.trim(ChunkX2 / 2); - ASSERT_TRUE(data.empty()); + Data.trim(ChunkX2 / 2); + ASSERT_TRUE(Data.empty()); // Here our iterators should be the same. - auto I1Begin = data.begin(), I1End = data.end(); + auto I1Begin = Data.begin(), I1End = Data.end(); EXPECT_EQ(I0Begin, I1Begin); EXPECT_EQ(I0End, I1End); // Then we ensure that adding elements back works just fine. for (auto i = ChunkX2; i > 0u; --i) { - data.Append({static_cast(i), static_cast(i)}); + Data.Append({static_cast(i), static_cast(i)}); + } + EXPECT_EQ(Data.size(), ChunkX2); +} + +struct ShadowStackEntry { + uint64_t EntryTSC = 0; + uint64_t *NodePtr = nullptr; + ShadowStackEntry(uint64_t T, uint64_t *N) : EntryTSC(T), NodePtr(N) {} +}; + +TEST(SegmentedArrayTest, SimulateStackBehaviour) { + using AllocatorType = typename Array::AllocatorType; + AllocatorType A(1 << 10, 0); + Array Data(A); + static uint64_t Dummy = 0; + constexpr uint64_t Max = 9; + + for (uint64_t i = 0; i < Max; ++i) { + auto P = Data.Append({i, &Dummy}); + ASSERT_NE(P, nullptr); + ASSERT_EQ(P->NodePtr, &Dummy); + auto &Back = Data.back(); + ASSERT_EQ(Back.NodePtr, &Dummy); + ASSERT_EQ(Back.EntryTSC, i); + } + + // Simulate a stack by checking the data from the end as we're trimming. + auto Counter = Max; + ASSERT_EQ(Data.size(), size_t(Max)); + while (!Data.empty()) { + const auto &Top = Data.back(); + uint64_t *TopNode = Top.NodePtr; + EXPECT_EQ(TopNode, &Dummy) << "Counter = " << Counter; + Data.trim(1); + --Counter; + ASSERT_EQ(Data.size(), size_t(Counter)); } - EXPECT_EQ(data.size(), ChunkX2); } } // namespace Index: compiler-rt/lib/xray/xray_allocator.h =================================================================== --- compiler-rt/lib/xray/xray_allocator.h +++ compiler-rt/lib/xray/xray_allocator.h @@ -18,12 +18,12 @@ #include "sanitizer_common/sanitizer_allocator_internal.h" #include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_internal_defs.h" #include "sanitizer_common/sanitizer_mutex.h" +#include "xray_utils.h" #include #include -#include "sanitizer_common/sanitizer_internal_defs.h" - namespace __xray { /// The Allocator type hands out fixed-sized chunks of memory that are @@ -40,8 +40,7 @@ // The Allocator returns memory as Block instances. struct Block { /// Compute the minimum cache-line size multiple that is >= N. - static constexpr auto Size = - kCacheLineSize * ((N / kCacheLineSize) + (N % kCacheLineSize ? 
1 : 0)); + static constexpr auto Size = nearest_boundary(N, kCacheLineSize); void *Data = nullptr; }; @@ -61,15 +60,16 @@ // We compute the number of pointers to areas in memory where we consider as // individual blocks we've allocated. To ensure that instances of the - // BlockLink object are cache-line sized, we deduct one additional - // pointers worth representing the pointer to the previous link. + // BlockLink object are cache-line sized, we deduct two pointers worth + // representing the pointer to the previous link and the backing store for + // the whole block. // // This structure corresponds to the following layout: // - // Blocks [ 0, 1, 2, .., BlockPtrCount - 1] + // Blocks [ 0, 1, 2, .., BlockPtrCount - 2] // static constexpr auto BlockPtrCount = - (kCacheLineSize / sizeof(Block *)) - 1; + (kCacheLineSize / sizeof(Block *)) - 2; BlockLink() { // Zero out Blocks. @@ -81,6 +81,7 @@ // FIXME: Align this to cache-line address boundaries? Block Blocks[BlockPtrCount]; BlockLink *Prev = nullptr; + void *BackingStore = nullptr; }; static_assert(sizeof(BlockLink) == kCacheLineSize, @@ -96,17 +97,26 @@ BlockLink *Tail = &NullLink; size_t Counter = 0; - BlockLink *NewChainLink() { + BlockLink *NewChainLink(uint64_t Alignment) { auto NewChain = reinterpret_cast( InternalAlloc(sizeof(BlockLink), nullptr, kCacheLineSize)); auto BackingStore = reinterpret_cast(InternalAlloc( - BlockLink::BlockPtrCount * Block::Size, nullptr, kCacheLineSize)); + (BlockLink::BlockPtrCount + 1) * Block::Size, nullptr, Alignment)); size_t Offset = 0; DCHECK_NE(NewChain, nullptr); DCHECK_NE(BackingStore, nullptr); + NewChain->BackingStore = BackingStore; + + // Here we ensure that the alignment of the pointers we're handing out + // adhere to the alignment requirements of the call to Allocate(). for (auto &B : NewChain->Blocks) { - B.Data = BackingStore + Offset; - Offset += Block::Size; + auto AlignmentAdjustment = + nearest_boundary(reinterpret_cast(BackingStore + Offset), + Alignment) - + reinterpret_cast(BackingStore + Offset); + B.Data = BackingStore + AlignmentAdjustment + Offset; + DCHECK_EQ(reinterpret_cast(B.Data) % Alignment, 0); + Offset += AlignmentAdjustment + Block::Size; } NewChain->Prev = Tail; return NewChain; @@ -117,7 +127,7 @@ // FIXME: Implement PreAllocate support! } - Block Allocate() { + Block Allocate(uint64_t Alignment = 8) { SpinMutexLock Lock(&Mutex); // Check whether we're over quota. if (Counter * Block::Size >= MaxMemory) @@ -128,7 +138,7 @@ Block B{}; BlockLink *Link = Tail; if (UNLIKELY(Counter == 0 || ChainOffset == 0)) - Tail = Link = NewChainLink(); + Tail = Link = NewChainLink(Alignment); B = Link->Blocks[ChainOffset]; ++Counter; @@ -140,7 +150,7 @@ for (auto *C = Tail; C != &NullLink;) { // We know that the data block is a large contiguous page, we deallocate // that at once. - InternalFree(C->Blocks[0].Data); + InternalFree(C->BackingStore); auto Prev = C->Prev; InternalFree(C); C = Prev; Index: compiler-rt/lib/xray/xray_function_call_trie.h =================================================================== --- compiler-rt/lib/xray/xray_function_call_trie.h +++ compiler-rt/lib/xray/xray_function_call_trie.h @@ -17,8 +17,8 @@ #include "xray_profiling_flags.h" #include "xray_segmented_array.h" +#include // For placement new. #include -#include // For placement new. 
namespace __xray { @@ -101,6 +101,8 @@ NodeIdPair(Node *N, int32_t F) : NodePtr(N), FId(F) {} }; + static_assert(sizeof(NodeIdPair) == 16, ""); + using NodeIdPairArray = Array; using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType; @@ -128,15 +130,12 @@ private: struct ShadowStackEntry { - int32_t FId; // We're copying the function ID into the stack to avoid having - // to reach into the node just to get the function ID. uint64_t EntryTSC; Node *NodePtr; // We add a constructor here to allow us to inplace-construct through // Array<...>'s AppendEmplace. - ShadowStackEntry(int32_t F, uint64_t T, Node *N) - : FId(F), EntryTSC(T), NodePtr(N) {} + ShadowStackEntry(uint64_t T, Node *N) : EntryTSC{T}, NodePtr{N} {} }; using NodeArray = Array; @@ -219,30 +218,30 @@ // TODO: Support configuration of options through the arguments. static Allocators InitAllocators() { + return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max); + } + + static Allocators InitAllocatorsCustom(uptr Max) { Allocators A; auto NodeAllocator = reinterpret_cast( InternalAlloc(sizeof(Allocators::NodeAllocatorType))); - new (NodeAllocator) Allocators::NodeAllocatorType( - profilingFlags()->per_thread_allocator_max, 0); + new (NodeAllocator) Allocators::NodeAllocatorType(Max, 0); A.NodeAllocator = NodeAllocator; auto RootAllocator = reinterpret_cast( InternalAlloc(sizeof(Allocators::RootAllocatorType))); - new (RootAllocator) Allocators::RootAllocatorType( - profilingFlags()->per_thread_allocator_max, 0); + new (RootAllocator) Allocators::RootAllocatorType(Max, 0); A.RootAllocator = RootAllocator; auto ShadowStackAllocator = reinterpret_cast( InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType))); - new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType( - profilingFlags()->per_thread_allocator_max, 0); + new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max, 0); A.ShadowStackAllocator = ShadowStackAllocator; auto NodeIdPairAllocator = reinterpret_cast( InternalAlloc(sizeof(NodeIdPairAllocatorType))); - new (NodeIdPairAllocator) - NodeIdPairAllocatorType(profilingFlags()->per_thread_allocator_max, 0); + new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max, 0); A.NodeIdPairAllocator = NodeIdPairAllocator; return A; } @@ -253,20 +252,14 @@ ShadowStackArray ShadowStack; NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr; - const Allocators &GetGlobalAllocators() { - static const Allocators A = [] { return InitAllocators(); }(); - return A; - } - public: explicit FunctionCallTrie(const Allocators &A) : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator), ShadowStack(*A.ShadowStackAllocator), NodeIdPairAllocator(A.NodeIdPairAllocator) {} - FunctionCallTrie() : FunctionCallTrie(GetGlobalAllocators()) {} - - void enterFunction(int32_t FId, uint64_t TSC) { + void enterFunction(const int32_t FId, uint64_t TSC) { + DCHECK_NE(FId, 0); // This function primarily deals with ensuring that the ShadowStack is // consistent and ready for when an exit event is encountered. if (UNLIKELY(ShadowStack.empty())) { @@ -275,12 +268,13 @@ if (UNLIKELY(NewRoot == nullptr)) return; Roots.Append(NewRoot); - ShadowStack.AppendEmplace(FId, TSC, NewRoot); + ShadowStack.AppendEmplace(TSC, NewRoot); return; } auto &Top = ShadowStack.back(); auto TopNode = Top.NodePtr; + DCHECK_NE(TopNode, nullptr); // If we've seen this callee before, then we just access that node and place // that on the top of the stack. 
@@ -288,7 +282,7 @@ [FId](const NodeIdPair &NR) { return NR.FId == FId; }); if (Callee != nullptr) { CHECK_NE(Callee->NodePtr, nullptr); - ShadowStack.AppendEmplace(FId, TSC, Callee->NodePtr); + ShadowStack.AppendEmplace(TSC, Callee->NodePtr); return; } @@ -297,8 +291,10 @@ Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId); if (UNLIKELY(NewNode == nullptr)) return; + DCHECK_NE(NewNode, nullptr); TopNode->Callees.AppendEmplace(NewNode, FId); - ShadowStack.AppendEmplace(FId, TSC, NewNode); + ShadowStack.AppendEmplace(TSC, NewNode); + DCHECK_NE(ShadowStack.back().NodePtr, nullptr); return; } @@ -307,14 +303,11 @@ // entered this function before. We do as little processing here as we can, // since most of the hard work would have already been done at function // entry. - if (UNLIKELY(ShadowStack.empty())) - return; - uint64_t CumulativeTreeTime = 0; while (!ShadowStack.empty()) { - auto &Top = ShadowStack.back(); + const auto &Top = ShadowStack.back(); auto TopNode = Top.NodePtr; - auto TopFId = TopNode->FId; + DCHECK_NE(TopNode, nullptr); auto LocalTime = TSC - Top.EntryTSC; TopNode->CallCount++; TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime; @@ -322,7 +315,7 @@ ShadowStack.trim(1); // TODO: Update the histogram for the node. - if (TopFId == FId) + if (TopNode->FId == FId) break; } } @@ -344,28 +337,34 @@ // This function must *not* be called with a non-empty FunctionCallTrie |O|. void deepCopyInto(FunctionCallTrie &O) const { DCHECK(O.getRoots().empty()); + + // We then push the root into a stack, to use as the parent marker for new + // nodes we push in as we're traversing depth-first down the call tree. + struct NodeAndParent { + FunctionCallTrie::Node *Node; + FunctionCallTrie::Node *NewNode; + }; + using Stack = Array; + + typename Stack::AllocatorType StackAllocator( + profilingFlags()->stack_allocator_max, 0); + Stack DFSStack(StackAllocator); + for (const auto Root : getRoots()) { // Add a node in O for this root. auto NewRoot = O.Nodes.AppendEmplace( nullptr, *O.NodeIdPairAllocator, Root->CallCount, Root->CumulativeLocalTime, Root->FId); - O.Roots.Append(NewRoot); - // We then push the root into a stack, to use as the parent marker for new - // nodes we push in as we're traversing depth-first down the call tree. - struct NodeAndParent { - FunctionCallTrie::Node *Node; - FunctionCallTrie::Node *NewNode; - }; - using Stack = Array; + // Because we cannot allocate more memory we should bail out right away. + if (UNLIKELY(NewRoot == nullptr)) + return; - typename Stack::AllocatorType StackAllocator( - profilingFlags()->stack_allocator_max, 0); - Stack DFSStack(StackAllocator); + O.Roots.Append(NewRoot); // TODO: Figure out what to do if we fail to allocate any more stack // space. Maybe warn or report once? 
- DFSStack.Append(NodeAndParent{Root, NewRoot}); + DFSStack.AppendEmplace(Root, NewRoot); while (!DFSStack.empty()) { NodeAndParent NP = DFSStack.back(); DCHECK_NE(NP.Node, nullptr); @@ -375,9 +374,10 @@ auto NewNode = O.Nodes.AppendEmplace( NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime, Callee.FId); - DCHECK_NE(NewNode, nullptr); + if (UNLIKELY(NewNode == nullptr)) + return; NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId); - DFSStack.Append(NodeAndParent{Callee.NodePtr, NewNode}); + DFSStack.AppendEmplace(Callee.NodePtr, NewNode); } } } @@ -408,6 +408,9 @@ if (R == nullptr) { TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0, 0, Root->FId); + if (UNLIKELY(TargetRoot == nullptr)) + return; + O.Roots.Append(TargetRoot); } else { TargetRoot = *R; @@ -430,10 +433,14 @@ if (TargetCallee == nullptr) { auto NewTargetNode = O.Nodes.AppendEmplace( NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId); + + if (UNLIKELY(NewTargetNode == nullptr)) + return; + TargetCallee = NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId); } - DFSStack.Append(NodeAndTarget{Callee.NodePtr, TargetCallee->NodePtr}); + DFSStack.AppendEmplace(Callee.NodePtr, TargetCallee->NodePtr); } } } Index: compiler-rt/lib/xray/xray_profile_collector.cc =================================================================== --- compiler-rt/lib/xray/xray_profile_collector.cc +++ compiler-rt/lib/xray/xray_profile_collector.cc @@ -16,8 +16,8 @@ #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_vector.h" #include "xray_profiling_flags.h" -#include #include +#include #include namespace __xray { @@ -55,7 +55,8 @@ GlobalAllocators = reinterpret_cast( InternalAlloc(sizeof(FunctionCallTrie::Allocators))); new (GlobalAllocators) FunctionCallTrie::Allocators(); - *GlobalAllocators = FunctionCallTrie::InitAllocators(); + *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom( + profilingFlags()->global_allocator_max); }); DCHECK_NE(GlobalAllocators, nullptr); @@ -83,12 +84,11 @@ // and is decoupled from the lifetime management required by the managed // allocator we have in XRay. // - Item->Trie = reinterpret_cast( - InternalAlloc(sizeof(FunctionCallTrie))); + Item->Trie = reinterpret_cast(InternalAlloc( + sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie))); DCHECK_NE(Item->Trie, nullptr); new (Item->Trie) FunctionCallTrie(*GlobalAllocators); } - DCHECK_NE(Item, nullptr); T.deepCopyInto(*Item->Trie); } Index: compiler-rt/lib/xray/xray_profiling.cc =================================================================== --- compiler-rt/lib/xray/xray_profiling.cc +++ compiler-rt/lib/xray/xray_profiling.cc @@ -258,9 +258,9 @@ { SpinMutexLock Lock(&ProfilerOptionsMutex); FlagParser ConfigParser; - auto *F = profilingFlags(); - F->setDefaults(); - registerProfilerFlags(&ConfigParser, F); + ProfilerFlags Flags; + Flags.setDefaults(); + registerProfilerFlags(&ConfigParser, &Flags); ConfigParser.ParseString(profilingCompilerDefinedFlags()); const char *Env = GetEnv("XRAY_PROFILING_OPTIONS"); if (Env == nullptr) @@ -271,6 +271,7 @@ ConfigParser.ParseString(static_cast(Options)); if (Verbosity()) ReportUnrecognizedFlags(); + *profilingFlags() = Flags; } // We need to reset the profile data collection implementation now. 
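
Note (illustrative only, not part of the patch): a minimal sketch of how the revised FunctionCallTrie is meant to be used after the changes above, mirroring the unit tests. Allocators are now always constructed explicitly and passed in (the default-constructed trie backed by global allocators has been removed), and an exit pops the shadow stack until the matching function ID is reached. The free function `exampleUsage` below is hypothetical.

// Hypothetical usage sketch for the post-patch FunctionCallTrie API.
#include "xray_function_call_trie.h"
#include "xray_profiling_flags.h"

namespace __xray {
void exampleUsage() {
  profilingFlags()->setDefaults();
  // Allocators must be created explicitly and outlive the trie.
  auto A = FunctionCallTrie::InitAllocators();
  FunctionCallTrie Trie(A);

  Trie.enterFunction(/*FId=*/1, /*TSC=*/1);
  Trie.enterFunction(/*FId=*/2, /*TSC=*/2);
  Trie.exitFunction(/*FId=*/1, /*TSC=*/100); // Pops FId 2, then FId 1.

  // Roots now hold a single node for FId 1 with one callee for FId 2.
  const auto &Roots = Trie.getRoots();
  (void)Roots;
}
} // namespace __xray
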
Index: compiler-rt/lib/xray/xray_segmented_array.h =================================================================== --- compiler-rt/lib/xray/xray_segmented_array.h +++ compiler-rt/lib/xray/xray_segmented_array.h @@ -18,21 +18,13 @@ #include "sanitizer_common/sanitizer_allocator.h" #include "xray_allocator.h" +#include "xray_utils.h" +#include #include #include namespace __xray { -namespace { - -constexpr size_t gcd(size_t a, size_t b) { - return (b == 0) ? a : gcd(b, a % b); -} - -constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); } - -} // namespace - /// The Array type provides an interface similar to std::vector<...> but does /// not shrink in size. Once constructed, elements can be appended but cannot be /// removed. The implementation is heavily dependent on the contract provided by @@ -45,10 +37,12 @@ /// size, to allow us to maximise the number of T objects we can place in /// cache-line multiple sized blocks. To get back the number of T's, we divide /// this least common multiple by the size of T. -template +template struct Array { static constexpr size_t ChunkSize = N; - static constexpr size_t AllocatorChunkSize = sizeof(T) * ChunkSize; + static constexpr size_t ElementStorageSize = next_pow2(sizeof(T)); + static constexpr size_t AllocatorChunkSize = ElementStorageSize * ChunkSize; using AllocatorType = Allocator; static_assert(std::is_trivially_destructible::value, "T must be trivially destructible."); @@ -60,8 +54,8 @@ struct Chunk { typename AllocatorType::Block Block; static constexpr size_t Size = N; - Chunk *Prev = nullptr; - Chunk *Next = nullptr; + Chunk *Prev = this; + Chunk *Next = this; }; static Chunk SentinelChunk; @@ -70,7 +64,6 @@ Chunk *Head = &SentinelChunk; Chunk *Tail = &SentinelChunk; size_t Size = 0; - size_t FreeElements = 0; // Here we keep track of chunks in the freelist, to allow us to re-use chunks // when elements are trimmed off the end. @@ -85,17 +78,22 @@ auto *FreeChunk = Freelist; Freelist = FreeChunk->Next; FreeChunk->Next = &SentinelChunk; + Freelist->Prev = &SentinelChunk; return FreeChunk; } - auto Block = Alloc->Allocate(); + auto Block = Alloc->Allocate(ElementStorageSize); if (Block.Data == nullptr) return nullptr; + // TODO: Maybe use a separate managed allocator for Chunk instances? - auto C = reinterpret_cast(InternalAlloc(sizeof(Chunk))); + auto C = reinterpret_cast( + InternalAlloc(sizeof(Chunk), nullptr, kCacheLineSize)); if (C == nullptr) return nullptr; C->Block = Block; + C->Prev = &SentinelChunk; + C->Next = &SentinelChunk; return C; } @@ -116,42 +114,52 @@ auto Chunk = NewChunk(); if (Chunk == nullptr) return nullptr; - Chunk->Prev = &SentinelChunk; - Chunk->Next = &SentinelChunk; - Head = Chunk; - Tail = Chunk; - return Chunk; + DCHECK_EQ(Chunk->Next, &SentinelChunk); + DCHECK_EQ(Chunk->Prev, &SentinelChunk); + return Head = Tail = Chunk; } Chunk *AppendNewChunk() { auto Chunk = NewChunk(); if (Chunk == nullptr) return nullptr; + DCHECK_NE(Tail, &SentinelChunk); + DCHECK_EQ(Tail->Next, &SentinelChunk); + DCHECK_EQ(Chunk->Prev, &SentinelChunk); + DCHECK_EQ(Chunk->Next, &SentinelChunk); Tail->Next = Chunk; Chunk->Prev = Tail; - Chunk->Next = &SentinelChunk; Tail = Chunk; - return Chunk; + return Tail; } // This Iterator models a BidirectionalIterator. 
template class Iterator { - Chunk *C = nullptr; + Chunk *C = &SentinelChunk; size_t Offset = 0; + size_t Size = 0; public: - Iterator(Chunk *IC, size_t Off) : C(IC), Offset(Off) {} + Iterator(Chunk *IC, size_t Off, size_t S) : C(IC), Offset(Off), Size(S) {} + Iterator(const Iterator &) noexcept = default; + Iterator() noexcept = default; + Iterator(Iterator &&) noexcept = default; + Iterator &operator=(const Iterator &) = default; + Iterator &operator=(Iterator &&) = default; + ~Iterator() = default; Iterator &operator++() { - if (++Offset % N) + if (++Offset % N || Offset == Size) return *this; - DCHECK_NE(C, &SentinelChunk); - // At this point, we know that Offset % N == 0, so we must advance the // chunk pointer. DCHECK_EQ(Offset % N, 0); + DCHECK_NE(Offset, Size); + DCHECK_NE(C, &SentinelChunk); + DCHECK_NE(C->Next, &SentinelChunk); C = C->Next; + DCHECK_NE(C, &SentinelChunk); return *this; } @@ -159,10 +167,12 @@ DCHECK_NE(C, &SentinelChunk); DCHECK_GT(Offset, 0); - // We check whether the offset was on a boundary before decrement, to see - // whether we need to retreat to the previous chunk. - if ((Offset-- % N) == 0) + auto PreviousOffset = Offset--; + if (PreviousOffset != Size && PreviousOffset % N == 0) { + DCHECK_NE(C->Prev, &SentinelChunk); C = C->Prev; + } + return *this; } @@ -190,12 +200,16 @@ U &operator*() const { DCHECK_NE(C, &SentinelChunk); - return reinterpret_cast(C->Block.Data)[Offset % N]; + auto RelOff = Offset % N; + return *reinterpret_cast(reinterpret_cast(C->Block.Data) + + (RelOff * ElementStorageSize)); } U *operator->() const { DCHECK_NE(C, &SentinelChunk); - return reinterpret_cast(C->Block.Data) + (Offset % N); + auto RelOff = Offset % N; + return reinterpret_cast(reinterpret_cast(C->Block.Data) + + (RelOff * ElementStorageSize)); } }; @@ -232,10 +246,11 @@ if (AppendNewChunk() == nullptr) return nullptr; - auto Position = reinterpret_cast(Tail->Block.Data) + Offset; + auto Position = + reinterpret_cast(reinterpret_cast(Tail->Block.Data) + + (Offset * ElementStorageSize)); *Position = E; ++Size; - FreeElements -= FreeElements ? 1 : 0; return Position; } @@ -245,16 +260,21 @@ return nullptr; auto Offset = Size % N; - if (UNLIKELY(Size != 0 && Offset == 0)) - if (AppendNewChunk() == nullptr) + auto *LatestChunk = Tail; + if (UNLIKELY(Size != 0 && Offset == 0)) { + LatestChunk = AppendNewChunk(); + if (LatestChunk == nullptr) return nullptr; + } - auto Position = reinterpret_cast(Tail->Block.Data) + Offset; + DCHECK_NE(Tail, &SentinelChunk); + auto Position = reinterpret_cast(LatestChunk->Block.Data) + + (Offset * ElementStorageSize); + DCHECK_EQ(reinterpret_cast(Position) % ElementStorageSize, 0); // In-place construct at Position. - new (Position) T(std::forward(args)...); + new (Position) T{std::forward(args)...}; ++Size; - FreeElements -= FreeElements ? 
1 : 0; - return Position; + return reinterpret_cast(Position); } T &operator[](size_t Offset) const { @@ -266,20 +286,22 @@ Offset -= N; DCHECK_NE(C, &SentinelChunk); } - auto Position = reinterpret_cast(C->Block.Data) + Offset; - return *Position; + return *reinterpret_cast(reinterpret_cast(C->Block.Data) + + (Offset * ElementStorageSize)); } T &front() const { DCHECK_NE(Head, &SentinelChunk); DCHECK_NE(Size, 0u); - return *reinterpret_cast(Head->Block.Data); + return *begin(); } T &back() const { DCHECK_NE(Tail, &SentinelChunk); - auto Offset = (Size - 1) % N; - return *(reinterpret_cast(Tail->Block.Data) + Offset); + DCHECK_NE(Size, 0u); + auto It = end(); + --It; + return *It; } template T *find_element(Predicate P) const { @@ -298,15 +320,18 @@ /// require allocation of new blocks for new elements added after trimming. void trim(size_t Elements) { DCHECK_LE(Elements, Size); + DCHECK_GT(Size, 0); + auto OldSize = Size; Size -= Elements; - FreeElements += Elements; - - // Here we need to check whether we've cleared enough elements to warrant - // putting blocks on to the freelist. We determine whether we need to - // right-size the internal list, by keeping track of the number of "free" - // elements still in the array. - auto ChunksToTrim = FreeElements / N; - for (size_t i = 0; i < ChunksToTrim; ++i, FreeElements -= N) { + + DCHECK_NE(Head, &SentinelChunk); + DCHECK_NE(Tail, &SentinelChunk); + + for (auto ChunksToTrim = + nearest_boundary(OldSize - nearest_boundary(Size, N), N) / N; + ChunksToTrim > 0; --ChunksToTrim) { + DCHECK_NE(Head, &SentinelChunk); + DCHECK_NE(Tail, &SentinelChunk); // Put the tail into the Freelist. auto *FreeChunk = Tail; Tail = Tail->Prev; @@ -314,17 +339,21 @@ Head = Tail; else Tail->Next = &SentinelChunk; + + DCHECK_EQ(Tail->Next, &SentinelChunk); FreeChunk->Next = Freelist; - FreeChunk->Prev = Freelist->Prev; + FreeChunk->Prev = &SentinelChunk; + if (Freelist != &SentinelChunk) + Freelist->Prev = FreeChunk; Freelist = FreeChunk; } } // Provide iterators. - Iterator begin() const { return Iterator(Head, 0); } - Iterator end() const { return Iterator(Tail, Size); } - Iterator cbegin() const { return Iterator(Head, 0); } - Iterator cend() const { return Iterator(Tail, Size); } + Iterator begin() const { return Iterator(Head, 0, Size); } + Iterator end() const { return Iterator(Tail, Size, Size); } + Iterator cbegin() const { return Iterator(Head, 0, Size); } + Iterator cend() const { return Iterator(Tail, Size, Size); } }; // We need to have this storage definition out-of-line so that the compiler can Index: compiler-rt/lib/xray/xray_utils.h =================================================================== --- compiler-rt/lib/xray/xray_utils.h +++ compiler-rt/lib/xray/xray_utils.h @@ -36,6 +36,24 @@ // file. int getLogFD(); +constexpr size_t gcd(size_t a, size_t b) { + return (b == 0) ? a : gcd(b, a % b); +} + +constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); } + +template constexpr T nearest_boundary(T number, T multiple) { + return multiple * ((number / multiple) + (number % multiple ? 1 : 0)); +} + +constexpr size_t next_pow2_helper(size_t num, size_t acc) { + return (1u << acc) >= num ? 
(1u << acc) : next_pow2_helper(num, acc + 1);
+}
+
+constexpr size_t next_pow2(size_t number) {
+  return next_pow2_helper(number, 1);
+}
+
 } // namespace __xray
 
 #endif // XRAY_UTILS_H
Index: llvm/include/llvm/XRay/Profile.h
===================================================================
--- /dev/null
+++ llvm/include/llvm/XRay/Profile.h
@@ -0,0 +1,109 @@
+//===- Profile.h - XRay Profile Abstraction -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the XRay Profile class representing the latency profile generated by
+// XRay's profiling mode.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_XRAY_PROFILE_H
+#define LLVM_XRAY_PROFILE_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include <list>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+namespace xray {
+
+class Profile;
+
+/// This function will attempt to load an XRay Profiling Mode profile from the
+/// provided |Filename|.
+Expected<Profile> loadProfile(StringRef Filename);
+
+/// This algorithm will merge two Profile instances into a single Profile
+/// instance, aggregating blocks by Thread ID.
+Profile mergeProfilesByThread(const Profile &L, const Profile &R);
+
+/// Profile instances are thread-compatible.
+class Profile {
+public:
+  using ThreadID = uint64_t;
+  using PathID = unsigned;
+  using FuncID = int32_t;
+
+  struct Data {
+    uint64_t CallCount;
+    uint64_t CumulativeLocalTime;
+  };
+
+  struct Block {
+    ThreadID Thread;
+    std::vector<std::pair<PathID, Data>> PathData;
+  };
+
+  /// Provides a sequence of function IDs from a previously interned PathID.
+  Expected<std::vector<FuncID>> path(PathID P) const;
+
+  /// The stack represented in |P| must be in stack order (leaf to root).
+  PathID internPath(ArrayRef<FuncID> P);
+
+  /// Appends a fully-formed Block instance into the Profile.
+  Error addBlock(Block &&B);
+
+  // No copies.
+  Profile(const Profile &) = delete;
+  Profile &operator=(const Profile &) = delete;
+
+  // Move only.
+  Profile(Profile &&) noexcept = default;
+  Profile &operator=(Profile &&) noexcept = default;
+
+  Profile() = default;
+  ~Profile() = default;
+
+private:
+  using BlockList = std::list<Block>;
+
+  struct TrieNode {
+    FuncID Func = 0;
+    std::vector<TrieNode *> Callees{};
+    TrieNode *Caller = nullptr;
+    PathID ID = 0;
+  };
+
+  // List of blocks associated with a Profile.
+  BlockList Blocks;
+
+  // List of TrieNode elements we've seen.
+  std::list<TrieNode> NodeStorage;
+
+  // List of call stack roots.
+  SmallVector<TrieNode *, 4> Roots;
+
+  // Reverse mapping between a PathID to a TrieNode*.
+  DenseMap<PathID, TrieNode *> PathIDMap;
+
+  // Used to increment the next available PathID.
+  PathID NextID = 1;
+
+public:
+  using const_iterator = BlockList::const_iterator;
+  const_iterator begin() const { return Blocks.begin(); }
+  const_iterator end() const { return Blocks.end(); }
+};
+
+} // namespace xray
+} // namespace llvm
+
+#endif
Index: llvm/include/llvm/XRay/Trace.h
===================================================================
--- llvm/include/llvm/XRay/Trace.h
+++ llvm/include/llvm/XRay/Trace.h
@@ -18,7 +18,6 @@
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/FileSystem.h"
 #include "llvm/XRay/XRayRecord.h"
 
 namespace llvm {
@@ -62,7 +61,9 @@
 };
 
 /// This function will attempt to load XRay trace records from the provided
-/// |Filename|.
+/// |Filename|. If the file provided contains an XRay Profile, this will error
+/// out and suggest to the user that the llvm/XRay/Profile.h header and
+/// `loadProfile(...)` function be called instead.
 Expected<Trace> loadTraceFile(StringRef Filename, bool Sort = false);
 
 } // namespace xray
Index: llvm/lib/XRay/CMakeLists.txt
===================================================================
--- llvm/lib/XRay/CMakeLists.txt
+++ llvm/lib/XRay/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_llvm_library(LLVMXRay
   InstrumentationMap.cpp
+  Profile.cpp
   Trace.cpp
 
   ADDITIONAL_HEADER_DIRS
Index: llvm/lib/XRay/Profile.cpp
===================================================================
--- /dev/null
+++ llvm/lib/XRay/Profile.cpp
@@ -0,0 +1,280 @@
+//===- Profile.cpp - XRay Profile Abstraction -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the XRay Profile class representing the latency profile generated by
+// XRay's profiling mode.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/XRay/Profile.h"
+
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include <memory>
+#include <utility>
+
+namespace llvm {
+namespace xray {
+
+namespace {
+
+struct BlockHeader {
+  uint32_t Size;
+  uint32_t Number;
+  uint64_t Thread;
+};
+
+static Expected<BlockHeader> readBlockHeader(DataExtractor &Extractor,
+                                             uint32_t &Offset) {
+  BlockHeader H;
+  uint32_t CurrentOffset = Offset;
+  H.Size = Extractor.getU32(&Offset);
+  if (Offset == CurrentOffset)
+    return make_error<StringError>(
+        Twine("Error parsing block header size at offset '") +
+            Twine(CurrentOffset) + "'",
+        std::make_error_code(std::errc::invalid_argument));
+  CurrentOffset = Offset;
+  H.Number = Extractor.getU32(&Offset);
+  if (Offset == CurrentOffset)
+    return make_error<StringError>(
+        Twine("Error parsing block header number at offset '") +
+            Twine(CurrentOffset) + "'",
+        std::make_error_code(std::errc::invalid_argument));
+  CurrentOffset = Offset;
+  H.Thread = Extractor.getU64(&Offset);
+  if (Offset == CurrentOffset)
+    return make_error<StringError>(
+        Twine("Error parsing block header thread id at offset '") +
+            Twine(CurrentOffset) + "'",
+        std::make_error_code(std::errc::invalid_argument));
+  return H;
+}
+
+static Expected<std::vector<Profile::FuncID>>
+readPath(DataExtractor &Extractor, uint32_t &Offset) {
+  // We're reading a sequence of int32_t's until we find a 0.
+  std::vector<Profile::FuncID> Path;
+  auto CurrentOffset = Offset;
+  int32_t FuncId;
+  do {
+    FuncId = Extractor.getSigned(&Offset, 4);
+    if (CurrentOffset == Offset)
+      return make_error<StringError>(
+          Twine("Error parsing path at offset '") + Twine(CurrentOffset) + "'",
+          std::make_error_code(std::errc::invalid_argument));
+    CurrentOffset = Offset;
+    Path.push_back(FuncId);
+  } while (FuncId != 0);
+  return std::move(Path);
+}
+
+static Expected<Profile::Data> readData(DataExtractor &Extractor,
+                                        uint32_t &Offset) {
+  // We expect a certain number of elements for Data:
+  //   - A 64-bit CallCount
+  //   - A 64-bit CumulativeLocalTime counter
+  Profile::Data D;
+  auto CurrentOffset = Offset;
+  D.CallCount = Extractor.getU64(&Offset);
+  if (CurrentOffset == Offset)
+    return make_error<StringError>(
+        Twine("Error parsing call counts at offset '") + Twine(CurrentOffset) +
+            "'",
+        std::make_error_code(std::errc::invalid_argument));
+  CurrentOffset = Offset;
+  D.CumulativeLocalTime = Extractor.getU64(&Offset);
+  if (CurrentOffset == Offset)
+    return make_error<StringError>(
+        Twine("Error parsing cumulative local time at offset '") +
+            Twine(CurrentOffset) + "'",
+        std::make_error_code(std::errc::invalid_argument));
+  return D;
+}
+
+} // namespace
+
+Profile::PathID Profile::internPath(ArrayRef<FuncID> Path) {
+  if (Path.empty())
+    return 0;
+
+  auto RevPath = reverse(Path);
+
+  // Find the root.
+  auto It = RevPath.begin();
+  auto PathRoot = *It++;
+  auto RootIt =
+      find_if(Roots, [PathRoot](TrieNode *N) { return N->Func == PathRoot; });
+
+  // If we've not seen this root before, remember it.
+  TrieNode *Node = nullptr;
+  if (RootIt == Roots.end()) {
+    NodeStorage.emplace_back();
+    Node = &NodeStorage.back();
+    Node->Func = PathRoot;
+    Roots.push_back(Node);
+  } else {
+    Node = *RootIt;
+  }
+
+  // Now traverse the path, re-creating if necessary.
+  while (It != RevPath.end()) {
+    auto NodeFuncID = *It++;
+    auto CalleeIt = find_if(Node->Callees, [NodeFuncID](TrieNode *N) {
+      return N->Func == NodeFuncID;
+    });
+    if (CalleeIt == Node->Callees.end()) {
+      NodeStorage.emplace_back();
+      auto NewNode = &NodeStorage.back();
+      NewNode->Func = NodeFuncID;
+      NewNode->Caller = Node;
+      Node->Callees.push_back(NewNode);
+      Node = NewNode;
+    } else {
+      Node = *CalleeIt;
+    }
+  }
+
+  // At this point, Node *must* be pointing at the leaf.
+  assert(Node->Func == Path.front());
+  if (Node->ID == 0) {
+    Node->ID = NextID++;
+    PathIDMap.insert({Node->ID, Node});
+  }
+  return Node->ID;
+}
+
+Error Profile::addBlock(Block &&B) {
+  if (B.Thread == 0)
+    return make_error<StringError>(
+        "Block may not have Thread ID 0.",
+        std::make_error_code(std::errc::invalid_argument));
+  if (B.PathData.empty())
+    return make_error<StringError>(
+        "Block may not have empty path data.",
+        std::make_error_code(std::errc::invalid_argument));
+
+  Blocks.emplace_back(std::move(B));
+  return Error::success();
+}
+
+Expected<std::vector<Profile::FuncID>> Profile::path(Profile::PathID P) const {
+  auto It = PathIDMap.find(P);
+  if (It == PathIDMap.end())
+    return make_error<StringError>(
+        Twine("PathID not found: ") + Twine(P),
+        std::make_error_code(std::errc::invalid_argument));
+  std::vector<FuncID> Path;
+  for (auto Node = It->second; Node; Node = Node->Caller)
+    Path.push_back(Node->Func);
+  return std::move(Path);
+}
+
+Profile mergeProfilesByThread(const Profile &L, const Profile &R) {
+  Profile Merged;
+  using PathDataMap = DenseMap<Profile::PathID, Profile::Data>;
+  using PathDataMapPtr = std::unique_ptr<PathDataMap>;
+  using PathDataVector = decltype(Profile::Block::PathData);
+  DenseMap<Profile::ThreadID, PathDataMapPtr> ThreadProfileIndex;
+  for (const auto &Block : L) {
+    auto Res = ThreadProfileIndex.insert(
+        {Block.Thread, PathDataMapPtr{new PathDataMap()}});
+    auto &It = Res.first;
+    for (const auto &PathAndData : Block.PathData) {
+      auto &PathID = PathAndData.first;
+      auto &Data = PathAndData.second;
+      auto NewPathID = Merged.internPath(cantFail(L.path(PathID)));
+      It->second->insert({NewPathID, Data});
+    }
+  }
+  for (const auto &Block : R) {
+    auto Res = ThreadProfileIndex.insert(
+        {Block.Thread, PathDataMapPtr{new PathDataMap()}});
+    auto &It = Res.first;
+    for (const auto &PathAndData : Block.PathData) {
+      auto &PathID = PathAndData.first;
+      auto &Data = PathAndData.second;
+      auto NewPathID = Merged.internPath(cantFail(R.path(PathID)));
+      PathDataMap::iterator PathDataIt;
+      bool Inserted;
+      std::tie(PathDataIt, Inserted) = It->second->insert({NewPathID, Data});
+      if (!Inserted) {
+        auto &ExistingData = PathDataIt->second;
+        ExistingData.CallCount += Data.CallCount;
+        ExistingData.CumulativeLocalTime += Data.CumulativeLocalTime;
+      }
+    }
+  }
+
+  for (const auto &IndexedThreadBlock : ThreadProfileIndex) {
+    PathDataVector PathAndData;
+    PathAndData.reserve(IndexedThreadBlock.second->size());
+    copy(*IndexedThreadBlock.second, std::back_inserter(PathAndData));
+    cantFail(
+        Merged.addBlock({IndexedThreadBlock.first, std::move(PathAndData)}));
+  }
+  return Merged;
+}
+
+Expected<Profile> loadProfile(StringRef Filename) {
+  int Fd;
+  if (auto EC = sys::fs::openFileForRead(Filename, Fd))
+    return make_error<StringError>(
+        Twine("Cannot read profile from '") + Filename + "'", EC);
+
+  uint64_t FileSize;
+  if (auto EC = sys::fs::file_size(Filename, FileSize))
+    return make_error<StringError>(
+        Twine("Cannot get filesize of '") + Filename + "'", EC);
+
+  std::error_code EC;
+  sys::fs::mapped_file_region MappedFile(
+      Fd, sys::fs::mapped_file_region::mapmode::readonly, FileSize, 0, EC);
+  if (EC)
+    return make_error<StringError>(
+        Twine("Cannot mmap profile '") + Filename + "'", EC);
+  StringRef Data(MappedFile.data(), MappedFile.size());
+
+  Profile P;
+  uint32_t Offset = 0;
+  DataExtractor Extractor(Data, true, 8);
+
+  // For each block we get from the file:
+  while (Offset != MappedFile.size()) {
+    auto HeaderOrError = readBlockHeader(Extractor, Offset);
+    if (!HeaderOrError)
+      return HeaderOrError.takeError();
+
+    // TODO: Maybe store this header information for each block, even just for
+    // debugging?
+    const auto &Header = HeaderOrError.get();
+
+    // Read in the path data.
+    auto PathOrError = readPath(Extractor, Offset);
+    if (!PathOrError)
+      return PathOrError.takeError();
+    const auto &Path = PathOrError.get();
+
+    // For each path we encounter, we should intern it to get a PathID.
+    auto DataOrError = readData(Extractor, Offset);
+    if (!DataOrError)
+      return DataOrError.takeError();
+    auto &Data = DataOrError.get();
+
+    if (auto E =
+            P.addBlock(Profile::Block{Profile::ThreadID{Header.Thread},
+                                      {{P.internPath(Path), std::move(Data)}}}))
+      return std::move(E);
+  }
+
+  return P;
+}
+
+} // namespace xray
+} // namespace llvm
Index: llvm/lib/XRay/Trace.cpp
===================================================================
--- llvm/lib/XRay/Trace.cpp
+++ llvm/lib/XRay/Trace.cpp
@@ -30,16 +30,34 @@
 
   // FIXME: Maybe deduce whether the data is little or big-endian using some
   // magic bytes in the beginning of the file?
 
-  // First 32 bytes of the file will always be the header. We assume a certain
-  // format here:
+  // First 32 bytes of the file will always be the header. For FDR and Basic
+  // mode, we assume a specific format:
   //
   //   (2)   uint16 : version
   //   (2)   uint16 : type
   //   (4)   uint32 : bitfield
   //   (8)   uint64 : cycle frequency
   //   (16)  -      : padding
+  //
+  // We also look for the Profiling Mode output, which uses an alternate data
+  // format using a set of magic bytes at the beginning of the file. If we see
+  // the first 64 bits (8 bytes) to be that set of magic bytes, then we bail
+  // out advising the caller to use a separate loader. We're doing this because
+  // the data format for the profiling mode doesn't provide access to specific
+  // events compared to FDR and Basic mode.
   DataExtractor HeaderExtractor(Data, true, 8);
+
+  {
+    uint32_t OffsetPtr = 0;
+    uint64_t MagicBytes = HeaderExtractor.getU64(&OffsetPtr);
+    if (MagicBytes == 0x7872617970726f66)
+      return make_error<StringError>(
+          Twine("Detected XRay profiling mode file; use the `Profile` loader "
+                "instead."),
+          std::make_error_code(std::errc::invalid_argument));
+  }
+
   uint32_t OffsetPtr = 0;
   FileHeader.Version = HeaderExtractor.getU16(&OffsetPtr);
   FileHeader.Type = HeaderExtractor.getU16(&OffsetPtr);
@@ -122,8 +140,8 @@
         return make_error<StringError>(
             Twine("Corrupted log, found arg payload following non-matching "
                   "function + thread record. Record for function ") +
-                Twine(Record.FuncId) + " != " + Twine(FuncId) + "; offset: " +
-                Twine(S.data() - Data.data()),
+                Twine(Record.FuncId) + " != " + Twine(FuncId) +
+                "; offset: " + Twine(S.data() - Data.data()),
             std::make_error_code(std::errc::executable_format_error));
 
       // Advance another four bytes to avoid padding.
OffsetPtr += 4; @@ -708,9 +726,9 @@ if (Sort) std::stable_sort(T.Records.begin(), T.Records.end(), - [&](const XRayRecord &L, const XRayRecord &R) { - return L.TSC < R.TSC; - }); + [&](const XRayRecord &L, const XRayRecord &R) { + return L.TSC < R.TSC; + }); return std::move(T); } Index: llvm/tools/llvm-xray/xray-stacks.cpp =================================================================== --- llvm/tools/llvm-xray/xray-stacks.cpp +++ llvm/tools/llvm-xray/xray-stacks.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/FormatVariadic.h" #include "llvm/XRay/Graph.h" #include "llvm/XRay/InstrumentationMap.h" +#include "llvm/XRay/Profile.h" #include "llvm/XRay/Trace.h" using namespace llvm; @@ -723,9 +724,18 @@ symbolize::LLVMSymbolizer Symbolizer(Opts); FuncIdConversionHelper FuncIdHelper(StacksInstrMap, Symbolizer, Map.getFunctionAddresses()); + // TODO: Someday, support output to files instead of just directly to // standard output. for (const auto &Filename : StackInputs) { + // First, we try to load the file using the Profile loader. If we happen to + // get a Profile object, we can then translate those directly into the + // format that we need. + auto ProfileOrErr = loadProfile(Filename); + if (!ProfileOrErr) + logAllUnhandledErrors(ProfileOrErr.takeError(), errs(), ""); + + // TODO: Handle Profile objects. auto TraceOrErr = loadTraceFile(Filename); if (!TraceOrErr) { if (!StackKeepGoing) Index: llvm/unittests/XRay/CMakeLists.txt =================================================================== --- llvm/unittests/XRay/CMakeLists.txt +++ llvm/unittests/XRay/CMakeLists.txt @@ -1,9 +1,11 @@ set(LLVM_LINK_COMPONENTS Support + XRay ) add_llvm_unittest(XRayTests GraphTest.cpp + ProfileTest.cpp ) add_dependencies(XRayTests intrinsics_gen) Index: llvm/unittests/XRay/ProfileTest.cpp =================================================================== --- /dev/null +++ llvm/unittests/XRay/ProfileTest.cpp @@ -0,0 +1,70 @@ +//===- ProfileTest.cpp - XRay Profile unit tests ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +#include "llvm/XRay/Profile.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace llvm { +namespace xray { +namespace { + +using ::testing::AllOf; +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::Field; +using ::testing::Not; +using ::testing::Pair; + +TEST(ProfileTest, CreateProfile) { Profile P; } + +TEST(ProfileTest, InternPath) { + Profile P; + auto Path0 = P.internPath({3, 2, 1}); + auto Path1 = P.internPath({3, 2, 1}); + auto Path2 = P.internPath({2, 1}); + EXPECT_THAT(Path0, Eq(Path1)); + EXPECT_THAT(Path0, Not(Eq(Path2))); +} + +TEST(ProfileTest, AddBlocks) { + Profile P; + EXPECT_TRUE(errorToBool(P.addBlock({}))); + EXPECT_TRUE(errorToBool(P.addBlock({1, {}}))); + EXPECT_FALSE(errorToBool(P.addBlock( + Profile::Block{Profile::ThreadID{1}, + { + {P.internPath({2, 1}), Profile::Data{1, 1000}}, + {P.internPath({3, 2, 1}), Profile::Data{10, 100}}, + }}))); +} + +TEST(ProfileTest, MergeProfilesByThread) { + Profile P0, P1; + EXPECT_FALSE(errorToBool(P0.addBlock( + Profile::Block{Profile::ThreadID{1}, + {{P0.internPath({2, 1}), Profile::Data{1, 1000}}}}))); + EXPECT_FALSE(errorToBool(P1.addBlock( + Profile::Block{Profile::ThreadID{1}, + {{P1.internPath({2, 1}), Profile::Data{1, 1000}}}}))); + + Profile Merged = mergeProfilesByThread(P0, P1); + EXPECT_THAT(Merged, + ElementsAre(AllOf( + Field(&Profile::Block::Thread, Eq(Profile::ThreadID{1})), + Field(&Profile::Block::PathData, + ElementsAre(Pair( + Merged.internPath({2, 1}), + AllOf(Field(&Profile::Data::CallCount, Eq(2u)), + Field(&Profile::Data::CumulativeLocalTime, + Eq(2000u))))))))); +} + +} // namespace +} // namespace xray +} // namespace llvm
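
Note (illustrative only, not part of the patch): a short sketch of how a tool such as llvm-xray might consume the new Profile API once the `// TODO: Handle Profile objects.` in xray-stacks.cpp is addressed. It relies only on the declarations introduced in llvm/XRay/Profile.h above; the helper `dumpProfile` is hypothetical.

// Hypothetical consumer of llvm::xray::Profile, using only the API added above.
#include "llvm/Support/raw_ostream.h"
#include "llvm/XRay/Profile.h"

static llvm::Error dumpProfile(llvm::StringRef Filename) {
  using namespace llvm;
  using namespace llvm::xray;
  auto ProfileOrErr = loadProfile(Filename);
  if (!ProfileOrErr)
    return ProfileOrErr.takeError();
  auto &P = *ProfileOrErr;
  for (const auto &Block : P) {
    outs() << "Thread " << Block.Thread << ":\n";
    for (const auto &PathAndData : Block.PathData) {
      // Expand the interned PathID back into a leaf-to-root list of FuncIDs.
      auto PathOrErr = P.path(PathAndData.first);
      if (!PathOrErr)
        return PathOrErr.takeError();
      for (const auto FId : *PathOrErr)
        outs() << FId << ' ';
      outs() << "| count=" << PathAndData.second.CallCount
             << " time=" << PathAndData.second.CumulativeLocalTime << '\n';
    }
  }
  return Error::success();
}
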