Index: compiler-rt/lib/xray/CMakeLists.txt =================================================================== --- compiler-rt/lib/xray/CMakeLists.txt +++ compiler-rt/lib/xray/CMakeLists.txt @@ -2,33 +2,61 @@ # XRay runtime library implementation files. set(XRAY_SOURCES - xray_init.cc - xray_flags.cc - xray_interface.cc - xray_log_interface.cc - xray_utils.cc) + xray_init.cc + xray_flags.cc + xray_interface.cc + xray_log_interface.cc + xray_utils.cc) # Implementation files for all XRay modes. set(XRAY_FDR_MODE_SOURCES - xray_buffer_queue.cc - xray_fdr_logging.cc) + xray_buffer_queue.cc + xray_fdr_logging.cc) set(XRAY_BASIC_MODE_SOURCES - xray_inmemory_log.cc) + xray_inmemory_log.cc) + +set(XRAY_PROFILER_MODE_SOURCES + xray_profile_collector.cc + xray_profiler.cc + xray_profiler_flags.cc) # Implementation files for all XRay architectures. -set(aarch64_SOURCES xray_AArch64.cc xray_trampoline_AArch64.S) -set(arm_SOURCES xray_arm.cc xray_trampoline_arm.S) -set(armhf_SOURCES ${arm_SOURCES}) -set(mips_SOURCES xray_mips.cc xray_trampoline_mips.S) -set(mipsel_SOURCES xray_mips.cc xray_trampoline_mips.S) -set(mips64_SOURCES xray_mips64.cc xray_trampoline_mips64.S) -set(mips64el_SOURCES xray_mips64.cc xray_trampoline_mips64.S) +set(x86_64_SOURCES + xray_x86_64.cc + xray_trampoline_x86_64.S) + +set(arm_SOURCES + xray_arm.cc + xray_trampoline_arm.S) + +set(armhf_SOURCES + ${arm_SOURCES}) + +set(aarch64_SOURCES + xray_AArch64.cc + xray_trampoline_AArch64.S) + +set(mips_SOURCES + xray_mips.cc + xray_trampoline_mips.S) + +set(mipsel_SOURCES + xray_mips.cc + xray_trampoline_mips.S) + +set(mips64_SOURCES + xray_mips64.cc + xray_trampoline_mips64.S) + +set(mips64el_SOURCES + xray_mips64.cc + xray_trampoline_mips64.S) + set(powerpc64le_SOURCES xray_powerpc64.cc xray_trampoline_powerpc64.cc xray_trampoline_powerpc64_asm.S) -set(x86_64_SOURCES xray_x86_64.cc xray_trampoline_x86_64.S) # Now put it all together... include_directories(..) @@ -72,6 +100,12 @@ SOURCES ${XRAY_BASIC_MODE_SOURCES} CFLAGS ${XRAY_CFLAGS} DEFS ${XRAY_COMMON_DEFINITIONS}) + add_compiler_rt_object_libraries(RTXrayPROFILER + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_SUPPORTED_ARCH} + SOURCES ${XRAY_PROFILER_MODE_SOURCES} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS}) # We only support running on osx for now. add_compiler_rt_runtime(clang_rt.xray @@ -106,6 +140,16 @@ LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} LINK_LIBS ${XRAY_LINK_LIBS} PARENT_TARGET xray) + add_compiler_rt_runtime(clang_rt.xray-profiler + STATIC + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_SUPPORTED_ARCH} + OBJECT_LIBS RTXrayPROFILER + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + PARENT_TARGET xray) else() # not Apple foreach(arch ${XRAY_SUPPORTED_ARCH}) if(NOT CAN_TARGET_${arch}) @@ -123,6 +167,10 @@ ARCHS ${arch} SOURCES ${XRAY_BASIC_MODE_SOURCES} CFLAGS ${XRAY_CFLAGS} DEFS ${XRAY_COMMON_DEFINITIONS}) + add_compiler_rt_object_libraries(RTXrayPROFILER + ARCHS ${arch} + SOURCES ${XRAY_PROFILER_MODE_SOURCES} CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS}) # Common XRay archive for instrumented binaries. 
add_compiler_rt_runtime(clang_rt.xray @@ -148,6 +196,14 @@ DEFS ${XRAY_COMMON_DEFINITIONS} OBJECT_LIBS RTXrayBASIC PARENT_TARGET xray) + # Profiler Mode runtime + add_compiler_rt_runtime(clang_rt.xray-profiler + STATIC + ARCHS ${arch} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + OBJECT_LIBS RTXrayPROFILER + PARENT_TARGET xray) endforeach() endif() # not Apple Index: compiler-rt/lib/xray/tests/CMakeLists.txt =================================================================== --- compiler-rt/lib/xray/tests/CMakeLists.txt +++ compiler-rt/lib/xray/tests/CMakeLists.txt @@ -72,6 +72,7 @@ add_xray_lib("RTXRay.test.osx" $ $ + $ $ $) else() @@ -79,6 +80,7 @@ add_xray_lib("RTXRay.test.${arch}" $ $ + $ $ $) endforeach() Index: compiler-rt/lib/xray/tests/unit/CMakeLists.txt =================================================================== --- compiler-rt/lib/xray/tests/unit/CMakeLists.txt +++ compiler-rt/lib/xray/tests/unit/CMakeLists.txt @@ -2,3 +2,11 @@ buffer_queue_test.cc xray_unit_test_main.cc) add_xray_unittest(XRayFDRLoggingTest SOURCES fdr_logging_test.cc xray_unit_test_main.cc) +add_xray_unittest(XRayAllocatorTest SOURCES + allocator_test.cc xray_unit_test_main.cc) +add_xray_unittest(XRaySegmentedArrayTest SOURCES + segmented_array_test.cc xray_unit_test_main.cc) +add_xray_unittest(XRayFunctionCallTrieTest SOURCES + function_call_trie_test.cc xray_unit_test_main.cc) +add_xray_unittest(XRayProfileCollectorTest SOURCES + profile_collector_test.cc xray_unit_test_main.cc) Index: compiler-rt/lib/xray/tests/unit/allocator_test.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/tests/unit/allocator_test.cc @@ -0,0 +1,42 @@ +//===-- allocator_test.cc -------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. +// +//===----------------------------------------------------------------------===// + +#include "xray_allocator.h" +#include "gtest/gtest.h" + +namespace __xray { +namespace { + +struct TestData { + s64 First; + s64 Second; +}; + +TEST(AllocatorTest, Construction) { Allocator A(2 << 11, 0); } + +TEST(AllocatorTest, Allocate) { + Allocator A(2 << 11, 0); + auto B = A.Allocate(); + (void)B; +} + +TEST(AllocatorTest, OverAllocate) { + Allocator A(sizeof(TestData), 0); + auto B1 = A.Allocate(); + (void)B1; + auto B2 = A.Allocate(); + ASSERT_EQ(B2.Data, nullptr); +} + +} // namespace +} // namespace __xray Index: compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc @@ -0,0 +1,225 @@ +//===-- function_call_trie_test.cc ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. 
+// +//===----------------------------------------------------------------------===// + +#include "gtest/gtest.h" +#include "xray_function_call_trie.h" + +namespace __xray { + +namespace { + +TEST(FunctionCallTrieTest, Construction) { + // We want to make sure that we can create one of these without the set of + // allocators we need. This will by default use the global allocators. + FunctionCallTrie Trie; +} + +TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) { + // FIXME: Support passing in configuration for allocators in the allocator + // constructors. + profilerFlags()->setDefaults(); + thread_local FunctionCallTrie::Allocators Allocators = + FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(Allocators); +} + +TEST(FunctionCallTrieTest, EnterAndExitFunction) { + profilerFlags()->setDefaults(); + thread_local auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + + Trie.enterFunction(1, 1); + Trie.exitFunction(1, 2); + + // We need a way to pull the data out. At this point, until we get a data + // collection service implemented, we're going to export the data as a list of + // roots, and manually walk through the structure ourselves. + + const auto &R = Trie.getRoots(); + + ASSERT_EQ(R.size(), 1u); + ASSERT_EQ(R.front()->FId, 1); + ASSERT_EQ(R.front()->CallCount, 1); + ASSERT_EQ(R.front()->CumulativeLocalTime, 1u); +} + +TEST(FunctionCallTrieTest, MissingFunctionEntry) { + thread_local auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + Trie.exitFunction(1, 1); + const auto &R = Trie.getRoots(); + + ASSERT_TRUE(R.empty()); +} + +TEST(FunctionCallTrieTest, MultipleRoots) { + profilerFlags()->setDefaults(); + thread_local auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + + // Enter and exit FId = 1. + Trie.enterFunction(1, 1); + Trie.exitFunction(1, 2); + + // Enter and exit FId = 2. + Trie.enterFunction(2, 3); + Trie.exitFunction(2, 4); + + const auto &R = Trie.getRoots(); + ASSERT_FALSE(R.empty()); + ASSERT_EQ(R.size(), 2u); + + // Inspect the roots that they have the right data. + const auto R0 = R[0]; + ASSERT_NE(R0, nullptr); + EXPECT_EQ(R0->CallCount, 1u); + EXPECT_EQ(R0->CumulativeLocalTime, 1u); + + const auto R1 = R[1]; + ASSERT_NE(R1, nullptr); + EXPECT_EQ(R1->CallCount, 1u); + EXPECT_EQ(R1->CumulativeLocalTime, 1u); +} + +// While missing an intermediary entry may be rare in practice, we still enforce +// that we can handle the case where we've missed the entry event somehow, in +// between call entry/exits. To illustrate, imagine the following shadow call +// stack: +// +// f0@t0 -> f1@t1 -> f2@t2 +// +// If for whatever reason we see an exit for `f2` @ t3, followed by an exit for +// `f0` @ t4 (i.e. no `f1` exit in between) then we need to handle the case of +// accounting local time to `f2` from d = (t3 - t2), then local time to `f1` +// as d' = (t3 - t1) - d, and then local time to `f0` as d'' = (t3 - t0) - d'. +TEST(FunctionCallTrieTest, MissingIntermediaryExit) { + profilerFlags()->setDefaults(); + thread_local auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + + Trie.enterFunction(1, 0); + Trie.enterFunction(2, 100); + Trie.enterFunction(3, 200); + Trie.exitFunction(3, 300); + Trie.exitFunction(1, 400); + + // What we should see at this point is all the functions in the trie in a + // specific order (1 -> 2 -> 3) with the appropriate count(s) and local + // latencies. 
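Working through the timestamps used above (an illustration of the accounting rule described in the comment, not part of the patch): f3 is charged 300 - 200 = 100 on its explicit exit. When the lone exit for f1 arrives at TSC 400, the implicit exit of f2 is charged (400 - 100) - 0 = 300, because no tree time has yet been accumulated within that exitFunction call, and f1 then gets (400 - 0) - 300 = 100. These are exactly the values asserted below.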
+ const auto &R = Trie.getRoots(); + ASSERT_FALSE(R.empty()); + ASSERT_EQ(R.size(), 1u); + + const auto &F1 = *R[0]; + ASSERT_EQ(F1.FId, 1); + ASSERT_FALSE(F1.Callees.empty()); + + const auto &F2 = *F1.Callees[0].NodePtr; + ASSERT_EQ(F2.FId, 2); + ASSERT_FALSE(F2.Callees.empty()); + + const auto &F3 = *F2.Callees[0].NodePtr; + ASSERT_EQ(F3.FId, 3); + ASSERT_TRUE(F3.Callees.empty()); + + // Now that we've established the preconditions, we check for specific aspects + // of the nodes. + EXPECT_EQ(F3.CallCount, 1); + EXPECT_EQ(F2.CallCount, 1); + EXPECT_EQ(F1.CallCount, 1); + EXPECT_EQ(F3.CumulativeLocalTime, 100); + EXPECT_EQ(F2.CumulativeLocalTime, 300); + EXPECT_EQ(F1.CumulativeLocalTime, 100); +} + +// TODO: Test that we can handle cross-CPU migrations, where TSCs are not +// guaranteed to be synchronised. +TEST(FunctionCallTrieTest, DeepCopy) { + profilerFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(A); + + Trie.enterFunction(1, 0); + Trie.enterFunction(2, 1); + Trie.exitFunction(2, 2); + Trie.enterFunction(3, 3); + Trie.exitFunction(3, 4); + Trie.exitFunction(1, 5); + + // We want to make a deep copy and compare notes. + auto B = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Copy(B); + Trie.deepCopyInto(Copy); + + ASSERT_NE(Trie.getRoots().size(), 0u); + ASSERT_EQ(Trie.getRoots().size(), Copy.getRoots().size()); + const auto &R0Orig = *Trie.getRoots()[0]; + const auto &R0Copy = *Copy.getRoots()[0]; + EXPECT_EQ(R0Orig.FId, 1); + EXPECT_EQ(R0Orig.FId, R0Copy.FId); + + ASSERT_EQ(R0Orig.Callees.size(), 2u); + ASSERT_EQ(R0Copy.Callees.size(), 2u); + + const auto &F1Orig = + *R0Orig.Callees + .find_element( + [](const FunctionCallTrie::NodeRef &R) { return R.FId == 2; }) + ->NodePtr; + const auto &F1Copy = + *R0Copy.Callees + .find_element( + [](const FunctionCallTrie::NodeRef &R) { return R.FId == 2; }) + ->NodePtr; + EXPECT_EQ(&R0Orig, F1Orig.Parent); + EXPECT_EQ(&R0Copy, F1Copy.Parent); +} + +TEST(FunctionCallTrieTest, MergeInto) { + profilerFlags()->setDefaults(); + auto A = FunctionCallTrie::InitAllocators(); + FunctionCallTrie T0(A); + FunctionCallTrie T1(A); + + // 1 -> 2 -> 3 + T0.enterFunction(1, 0); + T0.enterFunction(2, 1); + T0.enterFunction(3, 2); + T0.exitFunction(3, 3); + T0.exitFunction(2, 4); + T0.exitFunction(1, 5); + + // 1 -> 2 -> 3 + T1.enterFunction(1, 0); + T1.enterFunction(2, 1); + T1.enterFunction(3, 2); + T1.exitFunction(3, 3); + T1.exitFunction(2, 4); + T1.exitFunction(1, 5); + + auto B = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Merged(B); + + T0.mergeInto(Merged); + T1.mergeInto(Merged); + + ASSERT_EQ(Merged.getRoots().size(), 1u); + const auto &R0 = *Merged.getRoots()[0]; + EXPECT_EQ(R0.FId, 1); + EXPECT_EQ(R0.CallCount, 2); + EXPECT_EQ(R0.CumulativeLocalTime, 10); +} + +} // namespace + +} // namespace __xray Index: compiler-rt/lib/xray/tests/unit/profile_collector_test.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/tests/unit/profile_collector_test.cc @@ -0,0 +1,111 @@ +//===-- function_call_trie_test.cc ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. 
+// +//===----------------------------------------------------------------------===// +#include "gtest/gtest.h" + +#include "xray_profile_collector.h" +#include "xray_profiler_flags.h" +#include + +namespace __xray { +namespace { + +static constexpr auto kHeaderSize = 16u; + +void ValidateBlock(XRayBuffer B) { + profilerFlags()->setDefaults(); + ASSERT_NE(static_cast(B.Data), nullptr); + ASSERT_NE(B.Size, 0u); + ASSERT_GE(B.Size, kHeaderSize); + // We look at the block size, the block number, and the thread ID to ensure + // that none of them are zero (or that the header data is laid out as we + // expect). + char LocalBuffer[kHeaderSize] = {}; + memcpy(LocalBuffer, B.Data, kHeaderSize); + u32 BlockSize = 0; + u32 BlockNumber = 0; + u64 ThreadId = 0; + memcpy(&BlockSize, LocalBuffer, sizeof(u32)); + memcpy(&BlockNumber, LocalBuffer + sizeof(u32), sizeof(u32)); + memcpy(&ThreadId, LocalBuffer + (2 * sizeof(u32)), sizeof(u64)); + EXPECT_NE(BlockSize, 0u); + EXPECT_GE(BlockNumber, 0u); + EXPECT_NE(ThreadId, 0u); +} + +TEST(profileCollectorServiceTest, PostSerializeCollect) { + profilerFlags()->setDefaults(); + // The most basic use-case (the one we actually only care about) is the one + // where we ensure that we can post FunctionCallTrie instances, which are then + // destroyed but serialized properly. + // + // First, we initialise a set of allocators in the local scope. This ensures + // that we're able to copy the contents of the FunctionCallTrie that uses + // the local allocators. + auto Allocators = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(Allocators); + + // Then, we populate the trie with some data. + Trie.enterFunction(1, 1); + Trie.enterFunction(2, 2); + Trie.exitFunction(2, 3); + Trie.exitFunction(1, 4); + + // Then we post the data to the global profile collector service. + profileCollectorService::post(Trie, 1); + + // Then we serialize the data. + profileCollectorService::serialize(); + + // Then we go through a single buffer to see whether we're getting the data we + // expect. + auto B = profileCollectorService::nextBuffer({nullptr, 0}); + ValidateBlock(B); +} + +// We break out a function that will be run in multiple threads, one that will +// use a thread local allocator, and will post the FunctionCallTrie to the +// profileCollectorService. This simulates what the threads being profiled would +// be doing anyway, but through the XRay logging implementation. +void threadProcessing() { + thread_local auto Allocators = FunctionCallTrie::InitAllocators(); + FunctionCallTrie Trie(Allocators); + + Trie.enterFunction(1, 1); + Trie.enterFunction(2, 2); + Trie.exitFunction(2, 3); + Trie.exitFunction(1, 4); + + profileCollectorService::post(Trie, __sanitizer::GetTid()); +} + +TEST(profileCollectorServiceTest, PostSerializeCollectMultipleThread) { + profilerFlags()->setDefaults(); + std::thread t1(threadProcessing); + std::thread t2(threadProcessing); + + t1.join(); + t2.join(); + + // At this point, t1 and t2 are already done with what they were doing. + profileCollectorService::serialize(); + + // Ensure that we see two buffers. 
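Outside the unit test, a consumer would typically drain every serialized block by iterating until nextBuffer returns an empty XRayBuffer. A minimal sketch using only the API declared in this patch (consumeBlock is a hypothetical callback, not part of the patch):

// Drain all serialized profile blocks; assumes serialize() has already run.
for (auto Buf = profileCollectorService::nextBuffer({nullptr, 0});
     Buf.Data != nullptr; Buf = profileCollectorService::nextBuffer(Buf)) {
  // Each block starts with the 16-byte header (size, number, thread id)
  // described in xray_profile_collector.h, followed by the records.
  consumeBlock(Buf.Data, Buf.Size);
}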
+ auto B = profileCollectorService::nextBuffer({nullptr, 0}); + ValidateBlock(B); + + B = profileCollectorService::nextBuffer(B); + ValidateBlock(B); +} + +} // namespace +} // namespace __xray Index: compiler-rt/lib/xray/tests/unit/segmented_array_test.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/tests/unit/segmented_array_test.cc @@ -0,0 +1,91 @@ +#include "xray_segmented_array.h" +#include "gtest/gtest.h" + +namespace __xray { +namespace { + +struct TestData { + s64 First; + s64 Second; + + // Need a constructor for emplace operations. + TestData(s64 F, s64 S) : First(F), Second(S) {} +}; + +TEST(SegmentedArrayTest, Construction) { + Array Data; + (void)Data; +} + +TEST(SegmentedArrayTest, ConstructWithAllocator) { + using AllocatorType = typename Array::AllocatorType; + AllocatorType A(1 << 4, 0); + Array Data(A); + (void)Data; +} + +TEST(SegmentedArrayTest, ConstructAndPopulate) { + Array data; + ASSERT_NE(data.Append(TestData{0, 0}), nullptr); + ASSERT_NE(data.Append(TestData{1, 1}), nullptr); + ASSERT_EQ(data.size(), 2u); +} + +TEST(SegmentedArrayTest, ConstructPopulateAndLookup) { + Array data; + ASSERT_NE(data.Append(TestData{0, 1}), nullptr); + ASSERT_EQ(data.size(), 1u); + ASSERT_EQ(data[0].First, 0); + ASSERT_EQ(data[0].Second, 1); +} + +TEST(SegmentedArrayTest, PopulateWithMoreElements) { + Array data; + static const auto kMaxElements = 100u; + for (auto I = 0u; I < kMaxElements; ++I) { + ASSERT_NE(data.Append(TestData{I, I + 1}), nullptr); + } + ASSERT_EQ(data.size(), kMaxElements); + for (auto I = 0u; I < kMaxElements; ++I) { + ASSERT_EQ(data[I].First, I); + ASSERT_EQ(data[I].Second, I + 1); + } +} + +TEST(SegmentedArrayTest, AppendEmplace) { + Array data; + ASSERT_NE(data.AppendEmplace(1, 1), nullptr); + ASSERT_EQ(data[0].First, 1); + ASSERT_EQ(data[0].Second, 1); +} + +TEST(SegmentedArrayTest, AppendAndTrim) { + Array data; + ASSERT_NE(data.AppendEmplace(1, 1), nullptr); + ASSERT_EQ(data.size(), 1u); + data.trim(1); + ASSERT_EQ(data.size(), 0u); + ASSERT_TRUE(data.empty()); +} + +TEST(SegmentedArrayTest, Iterators) { + Array data; + ASSERT_TRUE(data.empty()); + ASSERT_EQ(data.begin(), data.end()); + auto I0 = data.begin(); + ASSERT_EQ(I0++, data.begin()); + ASSERT_NE(I0, data.begin()); + for (const auto &D : data) { + (void)D; + FAIL(); + } + ASSERT_NE(data.AppendEmplace(1, 1), nullptr); + ASSERT_EQ(data.size(), 1u); + ASSERT_NE(data.begin(), data.end()); + auto &D0 = *data.begin(); + ASSERT_EQ(D0.First, 1); + ASSERT_EQ(D0.Second, 1); +} + +} // namespace +} // namespace __xray Index: compiler-rt/lib/xray/xray_allocator.h =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_allocator.h @@ -0,0 +1,175 @@ +//===-- xray_allocator.h ---------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Defines the allocator interface for an arena allocator, used primarily for +// the profiling runtime. 
+// +//===----------------------------------------------------------------------===// +#ifndef XRAY_ALLOCATOR_H +#define XRAY_ALLOCATOR_H + +#include "sanitizer_common/sanitizer_allocator_internal.h" +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_mutex.h" +#include +#include + +#include "sanitizer_common/sanitizer_internal_defs.h" + +namespace __xray { + +/// The Allocator type hands out fixed-sized chunks of memory that are +/// cache-line aligned and sized. This is useful for placement of +/// performance-sensitive data in memory that's frequently accessed. The +/// allocator also self-limits the peak memory usage to a dynamically defined +/// maximum. +/// +template struct Allocator { + // Each Block will have a fixed size, dependent on N. The allocator will hand + // out individual blocks identified by the Data pointer, which is associated + // with a block. + struct Block { + /// Compute the minimum cache-line size multiple that is >= N. + static constexpr auto Size = + kCacheLineSize * ((N / kCacheLineSize) + (N % kCacheLineSize ? 1 : 0)); + void *Data = nullptr; + }; + +private: + // A BlockChain will contain a fixed number of blocks, each with an identifier + // to specify whether it's been handed out or not. We keep track of BlockChain + // iterators, which are basically a pointer to the chain and an offset into + // the fixed set of blocks associated with a link. The iterators are + // bidirectional. + struct BlockChain { + static_assert(kCacheLineSize % sizeof(void *) == 0, + "Cache line size is not divisible by size of void*; none of " + "the assumptions of the BlockChain will hold."); + + // This size represents the most number of pointers we can fit in a cache + // line, along with the next and previous pointers, and the bitmap to + // indicate which blocks have been handed out, and which ones are available. + // This structure corresponds to the following layout: + // + // Blocks [ 0, 1, 2, .., Size - 1] + // + // Using a 16-bit bitmap will allow us to cover a really big cache line that + // might store up to 16 pointers. + static constexpr auto Size = (kCacheLineSize / sizeof(Block)) - 3; + static_assert(Size <= 16, "Need to support larger than 16 "); + + // FIXME: Align this to cache-line address boundaries + Block Blocks[Size]{}; + BlockChain *Prev = nullptr; + BlockChain *Next = nullptr; + u16 UsedBits = 0; + }; + + static_assert(sizeof(BlockChain) == kCacheLineSize, + "BlockChain instances must be cache-line-sized."); + + static BlockChain NullLink; + + // FIXME: Implement a freelist, in case we actually do intend to return memory + // to the allocator, as opposed to just de-allocating everything in one go. + + size_t MaxMemory; + SpinMutex Mutex{}; + BlockChain *Chain = &NullLink; + size_t Counter = 0; + + BlockChain *NewChainLink() { + auto NewChain = reinterpret_cast( + InternalAlloc(sizeof(BlockChain), nullptr, kCacheLineSize)); + auto BackingStore = reinterpret_cast( + InternalAlloc(BlockChain::Size * Block::Size, nullptr, kCacheLineSize)); + size_t Offset = 0; + DCHECK_NE(NewChain, nullptr); + DCHECK_NE(BackingStore, nullptr); + for (auto &B : NewChain->Blocks) { + B.Data = BackingStore + Offset; + Offset += Block::Size; + } + NewChain->Prev = Chain; + NewChain->Next = &NullLink; + return NewChain; + } + +public: + Allocator(size_t M, size_t PreAllocate) : MaxMemory(M) { + // FIXME: Implement PreAllocate support! + } + + Block Allocate() { + SpinMutexLock Lock(&Mutex); + // Check whether we're over quota. 
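To make the sizing concrete (a worked example assuming kCacheLineSize is 64, the common case in sanitizer_common): for N = sizeof(TestData) = 16, the Block::Size computation above rounds up to 64, so an Allocator constructed with MaxMemory = 2 << 11 = 4096, as in the unit tests, can hand out at most 4096 / 64 = 64 blocks before the quota check below starts returning empty blocks.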
+ if (Counter * Block::Size >= MaxMemory) + return {}; + + size_t ChainOffset = Counter % BlockChain::Size; + u16 BitMask = 1 << ChainOffset; + + if (UNLIKELY(Chain == &NullLink)) { + // FIXME: Maybe pre-allocate some BlockChain instances in the same page + // as the Block objects. + auto NewChain = NewChainLink(); + + // Mark the first block in the newly created link as "used". + NewChain->UsedBits |= BitMask; + + // Prepare to return the first block. + auto B = NewChain->Blocks[ChainOffset]; + Chain->Next = NewChain; + Chain = NewChain; + ++Counter; + return B; + } + + // We need to see whether one of the blocks already in the current chain + // link can be used. + if ((Chain->UsedBits & BitMask) == 0) { + auto B = Chain->Blocks[ChainOffset]; + Chain->UsedBits |= BitMask; + ++Counter; + return B; + } + + // At this point we know that we've wrapped around, and the current chain no + // longer has free blocks. We then allocate a new chain and make that the + // current chain. + auto NewChain = NewChainLink(); + NewChain->UsedBits |= BitMask; + auto B = NewChain->Blocks[ChainOffset]; + Chain->Next = NewChain; + Chain = NewChain; + ++Counter; + return B; + } + + ~Allocator() NOEXCEPT { + // We need to deallocate all the blocks, including the chain links. + for (auto C = Chain; C != &NullLink;) { + // We know that the data block is one large contiguous allocation, so we + // deallocate it all at once. + InternalFree(C->Blocks[0].Data); + auto Prev = C->Prev; + InternalFree(C); + C = Prev; + } + } +}; + +// Storage for the NullLink sentinel. +template typename Allocator::BlockChain Allocator::NullLink; + +} // namespace __xray + +#endif // XRAY_ALLOCATOR_H Index: compiler-rt/lib/xray/xray_function_call_trie.h =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_function_call_trie.h @@ -0,0 +1,431 @@ +//===-- xray_function_call_trie.h ------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This file defines the interface for a function call trie. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_FUNCTION_CALL_TRIE_H +#define XRAY_FUNCTION_CALL_TRIE_H + +#include "xray_profiler_flags.h" +#include "xray_segmented_array.h" +#include + +namespace __xray { + +/// A FunctionCallTrie represents the stack traces of XRay instrumented +/// functions that we've encountered, where a node corresponds to a function and +/// the path from the root to the node represents its stack trace. Each node in +/// the trie will contain some useful values, including: +/// +/// * The cumulative amount of time spent in this particular node/stack. +/// * The number of times this stack has appeared. +/// * A histogram of latencies for that particular node. +/// +/// Each node in the trie will also contain a list of callees, represented using +/// an Array of NodeRef entries -- each NodeRef instance will contain the +/// function ID of the callee, and a pointer to the node.
+/// +/// If we visualise this data structure, we'll find the following potential +/// representation: +/// +/// [function id node] -> [callees] [cumulative time] +/// [call counter] [latency histogram] +/// +/// As an example, when we have a function in this pseudocode: +/// +/// func f(N) { +/// g() +/// h() +/// for i := 1..N { j() } +/// } +/// +/// We may end up with a trie of the following form: +/// +/// f -> [ g, h, j ] [...] [1] [...] +/// g -> [ ... ] [...] [1] [...] +/// h -> [ ... ] [...] [1] [...] +/// j -> [ ... ] [...] [N] [...] +/// +/// If, for instance, the function g() called j() like so: +/// +/// func g() { +/// for i := 1..10 { j() } +/// } +/// +/// We'll find the following updated trie: +/// +/// f -> [ g, h, j ] [...] [1] [...] +/// g -> [ j' ] [...] [1] [...] +/// h -> [ ... ] [...] [1] [...] +/// j -> [ ... ] [...] [N] [...] +/// j' -> [ ... ] [...] [10] [...] +/// +/// Note that we'll have a new node representing the path `f -> g -> j'` with +/// isolated data. This isolation gives us a means of representing the stack +/// traces as a path, as opposed to a key in a table. The alternative +/// implementation here would be to use a separate table for the path, and use +/// hashes of the path as an identifier to accumulate the information. We've +/// moved away from this approach as it takes a lot of time to compute the hash +/// every time we need to update a function's call information while we're +/// handling the entry and exit events. +/// +/// This approach allows us to maintain a shadow stack, which represents the +/// currently executing path, and on function exits quickly compute the amount +/// of time elapsed from the entry, then update the counters for the node +/// already represented in the trie. This necessitates an efficient +/// representation of the various data structures (the list of callees must be +/// cache-aware and efficient to look up, and the histogram must be compact and +/// quick to update) to enable us to keep the overheads of this implementation +/// to a minimum. +class FunctionCallTrie { +public: + struct Node; + + // We use a NodeRef type instead of a std::pair<...> so as not to rely on + // standard library types in this header. + struct NodeRef { + Node *NodePtr; + int32_t FId; + + // Constructor for in-place construction. + NodeRef(Node *N, int32_t F) : NodePtr(N), FId(F) {} + }; + + using NodeRefArray = Array; + using NodeRefAllocatorType = NodeRefArray::AllocatorType; + + // A Node in the FunctionCallTrie gives us a list of callees, the cumulative + // number of times this node actually appeared, the cumulative amount of time + // for this particular node including its children call times, and just the + // local time spent on this node. Each Node will have the ID of the XRay + // instrumented function that it is associated with. + struct Node { + Node *Parent; + NodeRefArray Callees; + int64_t CallCount; + int64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time. + int32_t FId; + + // We add a constructor here to allow us to construct Nodes in place through + // Array<...>'s AppendEmplace. + Node(Node *P, NodeRefAllocatorType &A, int64_t CC, int64_t CLT, int32_t F) + : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT), + FId(F) {} + + // TODO: Include the compact histogram. + }; + +private: + struct ShadowStackEntry { + int32_t FId; // We're copying the function ID into the stack to avoid having + // to reach into the node just to get the function ID.
+ uint64_t EntryTSC; + Node *NodePtr; + + // We add a constructor here to allow us to inplace-construct through + // Array<...>'s AppendEmplace. + ShadowStackEntry(int32_t F, uint64_t T, Node *N) + : FId(F), EntryTSC(T), NodePtr(N) {} + }; + + using NodeArray = Array; + using RootArray = Array; + using ShadowStackArray = Array; + +public: + // We collate the allocators we need into a single struct, as a convenience to + // allow us to initialize these as a group. + struct Allocators { + using NodeAllocatorType = NodeArray::AllocatorType; + using RootAllocatorType = RootArray::AllocatorType; + using ShadowStackAllocatorType = ShadowStackArray::AllocatorType; + using NodeRefAllocatorType = NodeRefAllocatorType; + + NodeAllocatorType *NodeAllocator = nullptr; + RootAllocatorType *RootAllocator = nullptr; + ShadowStackAllocatorType *ShadowStackAllocator = nullptr; + NodeRefAllocatorType *NodeRefAllocator = nullptr; + + Allocators() {} + Allocators(const Allocators &) = delete; + Allocators &operator=(const Allocators &) = delete; + + Allocators(Allocators &&O) + : NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator), + ShadowStackAllocator(O.ShadowStackAllocator), + NodeRefAllocator(O.NodeRefAllocator) { + O.NodeAllocator = nullptr; + O.RootAllocator = nullptr; + O.ShadowStackAllocator = nullptr; + O.NodeRefAllocator = nullptr; + } + + Allocators &operator=(Allocators &&O) { + { + auto Tmp = O.NodeAllocator; + O.NodeAllocator = this->NodeAllocator; + this->NodeAllocator = Tmp; + } + { + auto Tmp = O.RootAllocator; + O.RootAllocator = this->RootAllocator; + this->RootAllocator = Tmp; + } + { + auto Tmp = O.ShadowStackAllocator; + O.ShadowStackAllocator = this->ShadowStackAllocator; + this->ShadowStackAllocator = Tmp; + } + { + auto Tmp = O.NodeRefAllocator; + O.NodeRefAllocator = this->NodeRefAllocator; + this->NodeRefAllocator = Tmp; + } + return *this; + } + + ~Allocators() { + // Note that we cannot use delete on these pointers, as they need to be + // returned to the sanitizer_common library's internal memory tracking + // system. + if (NodeAllocator != nullptr) { + NodeAllocator->~NodeAllocatorType(); + InternalFree(NodeAllocator); + } + if (RootAllocator != nullptr) { + RootAllocator->~RootAllocatorType(); + InternalFree(RootAllocator); + } + if (ShadowStackAllocator != nullptr) { + ShadowStackAllocator->~ShadowStackAllocatorType(); + InternalFree(ShadowStackAllocator); + } + if (NodeRefAllocator != nullptr) { + NodeRefAllocator->~NodeRefAllocatorType(); + InternalFree(NodeRefAllocator); + } + } + }; // namespace __xray + + // TODO: Support configuration of options through the arguments. 
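For reference, the intended use of this grouped-allocator type, mirroring the unit tests in this patch (the TLD* names below are illustrative only), is one set of allocators and one trie per profiled thread:

// One set of allocators and one FunctionCallTrie per profiled thread.
thread_local FunctionCallTrie::Allocators TLDAllocators =
    FunctionCallTrie::InitAllocators();
thread_local FunctionCallTrie TLDTrie(TLDAllocators);

// The instrumentation handler then records events using TSC values obtained
// from readTSC(), e.g. TLDTrie.enterFunction(FuncId, TSC) on entry and
// TLDTrie.exitFunction(FuncId, TSC) on exit.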
+ static Allocators InitAllocators() { + Allocators A; + auto NodeAllocator = reinterpret_cast( + InternalAlloc(sizeof(Allocators::NodeAllocatorType))); + new (NodeAllocator) Allocators::NodeAllocatorType( + profilerFlags()->xray_profiling_per_thread_allocator_max, 0); + A.NodeAllocator = NodeAllocator; + + auto RootAllocator = reinterpret_cast( + InternalAlloc(sizeof(Allocators::RootAllocatorType))); + new (RootAllocator) Allocators::RootAllocatorType( + profilerFlags()->xray_profiling_per_thread_allocator_max, 0); + A.RootAllocator = RootAllocator; + + auto ShadowStackAllocator = + reinterpret_cast( + InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType))); + new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType( + profilerFlags()->xray_profiling_per_thread_allocator_max, 0); + A.ShadowStackAllocator = ShadowStackAllocator; + + auto NodeRefAllocator = + reinterpret_cast( + InternalAlloc(sizeof(Allocators::NodeRefAllocatorType))); + new (NodeRefAllocator) Allocators::NodeRefAllocatorType( + profilerFlags()->xray_profiling_per_thread_allocator_max, 0); + A.NodeRefAllocator = NodeRefAllocator; + return A; + } + +private: + NodeArray Nodes; + RootArray Roots; + ShadowStackArray ShadowStack; + NodeRefAllocatorType *NodeRefAllocator = nullptr; + + const Allocators &GetGlobalAllocators() { + static const Allocators A = [] { return InitAllocators(); }(); + return A; + } + +public: + explicit FunctionCallTrie(const Allocators &A) + : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator), + ShadowStack(*A.ShadowStackAllocator), + NodeRefAllocator(A.NodeRefAllocator) {} + + FunctionCallTrie() : FunctionCallTrie(GetGlobalAllocators()) {} + + void enterFunction(int32_t FId, uint64_t TSC) { + // This function primarily deals with ensuring that the ShadowStack is + // consistent and ready for when an exit event is encountered. + if (UNLIKELY(ShadowStack.empty())) { + auto NewRoot = Nodes.AppendEmplace(nullptr, *NodeRefAllocator, 0, 0, FId); + if (UNLIKELY(NewRoot == nullptr)) + return; + Roots.Append(NewRoot); + ShadowStack.AppendEmplace(FId, TSC, NewRoot); + return; + } + + auto &Top = ShadowStack.back(); + auto TopNode = Top.NodePtr; + + // If we've seen this callee before, then we just access that node and place + // that on the top of the stack. + auto Callee = TopNode->Callees.find_element( + [FId](const NodeRef &NR) { return NR.FId == FId; }); + if (Callee != nullptr) { + CHECK_NE(Callee->NodePtr, nullptr); + ShadowStack.AppendEmplace(FId, TSC, Callee->NodePtr); + return; + } + + // This means we've never seen this stack before, create a new node here. + auto NewNode = Nodes.AppendEmplace(TopNode, *NodeRefAllocator, 0, 0, FId); + if (UNLIKELY(NewNode == nullptr)) + return; + TopNode->Callees.AppendEmplace(NewNode, FId); + ShadowStack.AppendEmplace(FId, TSC, NewNode); + return; + } + + void exitFunction(int32_t FId, uint64_t TSC) { + // When we exit a function, we look up the ShadowStack to see whether we've + // entered this function before. We do as little processing here as we can, + // since most of the hard work would have already been done at function + // entry. 
+ if (UNLIKELY(ShadowStack.empty())) + return; + + uint64_t CumulativeTreeTime = 0; + while (!ShadowStack.empty()) { + auto &Top = ShadowStack.back(); + auto TopNode = Top.NodePtr; + auto TopFId = TopNode->FId; + auto LocalTime = TSC - Top.EntryTSC; + TopNode->CallCount++; + TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime; + CumulativeTreeTime += LocalTime; + ShadowStack.trim(1); + + // TODO: Update the histogram for the node. + if (TopFId == FId) + break; + } + } + + const RootArray &getRoots() const { return Roots; } + + // The deepCopyInto operation will update the provided FunctionCallTrie by + // re-creating the contents of this particular FunctionCallTrie in the other + // FunctionCallTrie. It will do this using a Depth First Traversal from the + // roots, and while doing so recreating the traversal in the provided + // FunctionCallTrie. + // + // This operation will *not* destroy the state in `O`, and thus may cause some + // duplicate entries in `O` if it is not empty. + void deepCopyInto(FunctionCallTrie &O) const { + for (const auto Root : getRoots()) { + // Add a node in O for this root. + auto NewRoot = + O.Nodes.AppendEmplace(nullptr, *O.NodeRefAllocator, Root->CallCount, + Root->CumulativeLocalTime, Root->FId); + O.Roots.Append(NewRoot); + + // We then push the root into a stack, to use as the parent marker for new + // nodes we push in as we're traversing depth-first down the call tree. + struct NodeAndParent { + FunctionCallTrie::Node *Node; + FunctionCallTrie::Node *NewNode; + }; + using Stack = Array; + + typename Stack::AllocatorType StackAllocator( + profilerFlags()->xray_profiling_stack_allocator_max, 0); + Stack DFSStack(StackAllocator); + DFSStack.Append(NodeAndParent{Root, NewRoot}); + while (!DFSStack.empty()) { + NodeAndParent NP = DFSStack.back(); + DCHECK_NE(NP.Node, nullptr); + DCHECK_NE(NP.NewNode, nullptr); + DFSStack.trim(1); + for (const auto Callee : NP.Node->Callees) { + auto NewNode = O.Nodes.AppendEmplace( + NP.NewNode, *O.NodeRefAllocator, Callee.NodePtr->CallCount, + Callee.NodePtr->CumulativeLocalTime, Callee.FId); + DCHECK_NE(NewNode, nullptr); + NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId); + DFSStack.Append(NodeAndParent{Callee.NodePtr, NewNode}); + } + } + } + } + + // The mergeInto operation will update the provided FunctionCallTrie by + // traversing the current trie's roots and updating (i.e. merging) the data in + // the nodes with the data in the target's nodes. If the node doesn't exist in + // the provided trie, we add a new one in the right position, and inherit the + // data from the original (current) trie, along with all its callees. 
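As exercised by the MergeInto unit test in this patch, the expected pattern is to fold several per-thread tries into one aggregate; a sketch (T0 and T1 stand for already-populated per-thread tries):

// Fold two per-thread tries into a single aggregate trie.
auto MergedAllocators = FunctionCallTrie::InitAllocators();
FunctionCallTrie Merged(MergedAllocators);
T0.mergeInto(Merged);
T1.mergeInto(Merged);
// Nodes for identical paths now carry summed CallCount and
// CumulativeLocalTime values.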
+ void mergeInto(FunctionCallTrie &O) const { + for (const auto Root : getRoots()) { + Node *TargetRoot = nullptr; + auto R = O.Roots.find_element( + [&](const Node *Node) { return Node->FId == Root->FId; }); + if (R == nullptr) { + TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeRefAllocator, 0, 0, + Root->FId); + O.Roots.Append(TargetRoot); + } else { + TargetRoot = *R; + } + + struct NodeAndTarget { + FunctionCallTrie::Node *OrigNode; + FunctionCallTrie::Node *TargetNode; + }; + using Stack = Array; + + typename Stack::AllocatorType StackAllocator( + profilerFlags()->xray_profiling_stack_allocator_max, 0); + Stack DFSStack(StackAllocator); + DFSStack.Append(NodeAndTarget{Root, TargetRoot}); + while (!DFSStack.empty()) { + NodeAndTarget NT = DFSStack.back(); + DCHECK_NE(NT.OrigNode, nullptr); + DCHECK_NE(NT.TargetNode, nullptr); + DFSStack.trim(1); + NT.TargetNode->CallCount += NT.OrigNode->CallCount; + NT.TargetNode->CumulativeLocalTime += NT.OrigNode->CumulativeLocalTime; + for (const auto Callee : NT.OrigNode->Callees) { + auto TargetCallee = NT.TargetNode->Callees.find_element( + [&](const FunctionCallTrie::NodeRef &C) { + return C.FId == Callee.FId; + }); + if (TargetCallee == nullptr) { + auto NewTargetNode = O.Nodes.AppendEmplace( + NT.TargetNode, *O.NodeRefAllocator, 0, 0, Callee.FId); + TargetCallee = + NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId); + } + DFSStack.Append(NodeAndTarget{Callee.NodePtr, TargetCallee->NodePtr}); + } + } + } + } + +}; // namespace __xray + +} // namespace __xray + +#endif // XRAY_FUNCTION_CALL_TRIE_H Index: compiler-rt/lib/xray/xray_profile_collector.h =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_profile_collector.h @@ -0,0 +1,88 @@ +//===-- xray_profile_collector.h -------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This file defines the interface for a data collection service, for XRay +// profiling. What we implement here is an in-process service where +// FunctionCallTrie instances can be handed off by threads, to be +// consolidated/collected. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_XRAY_PROFILE_COLLECTOR_H +#define XRAY_XRAY_PROFILE_COLLECTOR_H + +#include "xray_function_call_trie.h" + +#include "xray/xray_log_interface.h" + +namespace __xray { + +/// The ProfileCollectorService implements a centralised mechanism for +/// collecting FunctionCallTrie instances, indexed by thread ID. On demand, the +/// ProfileCollectorService can be queried for the most recent state of the +/// data, in a form that allows traversal. +namespace profileCollectorService { + +/// Posts the FunctionCallTrie assocaited with a specific Thread ID. This +/// will: +/// +/// - Make a copy of the FunctionCallTrie and store that against the Thread +/// ID. This will use the global allocator for the service-managed +/// FunctionCallTrie instances. +/// - Queue up a pointer to the FunctionCallTrie. +/// - If the queue is long enough (longer than some arbitrary threshold) we +/// then pre-calculate a single FunctionCallTrie for the whole process. 
+/// +/// +/// We are making a copy of the FunctionCallTrie because the intent is to have +/// this function be called at thread exit, or soon after the profiling +/// handler is finalized through the XRay APIs. By letting threads each +/// process their own thread-local FunctionCallTrie instances, we're removing +/// the need for synchronisation across threads while we're profiling. +/// However, once we're done profiling, we can then collect copies of these +/// FunctionCallTrie instances and pay the cost of the copy. +/// +/// NOTE: In the future, if this turns out to be more costly than "moving" the +/// FunctionCallTrie instances from the owning thread to the collector +/// service, then we can change the implementation to do it this way (moving) +/// instead. +void post(const FunctionCallTrie &T, tid_t TId); + +/// The serialize will process all FunctionCallTrie instances in memory, and +/// turn those into specifically formatted blocks, each describing the +/// function call trie's contents in a compact form. In memory, this looks +/// like the following layout: +/// +/// - block size (32 bits) +/// - block number (32 bits) +/// - thread id (64 bits) +/// - list of records: +/// - function ids in reverse call order, from leaf to root, terminated by +/// 0 (32 bits per function id) +/// - call count (64 bit) +/// - cumulative local time (64 bit) +/// - record delimiter (64 bit, 0x0) +/// +void serialize(); + +/// The reset function will clear out any internal memory held by the +/// service. The intent is to have the resetting be done in calls to the +/// initialization routine, or explicitly through the flush log API. +void reset(); + +/// This nextBuffer function is meant to implement the iterator functionality, +/// provided in the XRay API. +XRayBuffer nextBuffer(XRayBuffer B); + +}; // namespace profileCollectorService + +} // namespace __xray + +#endif // XRAY_XRAY_PROFILE_COLLECTOR_H Index: compiler-rt/lib/xray/xray_profile_collector.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_profile_collector.cc @@ -0,0 +1,275 @@ +//===-- xray_profile_collector.cc ------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This implements the interface for the profileCollectorService. 
+// +//===----------------------------------------------------------------------===// +#include + +#include "xray_profile_collector.h" +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_vector.h" +#include "xray_profiler_flags.h" +#include + +namespace __xray { +namespace profileCollectorService { + +namespace { + +SpinMutex GlobalMutex; +struct ThreadTrie { + tid_t TId; + FunctionCallTrie *Trie; +}; +Vector ThreadTries; + +struct ProfileBuffer { + void *Data; + size_t Size; +}; +Vector ProfileBuffers; + +struct BlockHeader { + u32 BlockSize; + u32 BlockNum; + u64 ThreadId; +}; + +FunctionCallTrie::Allocators *GlobalAllocators = nullptr; + +} // namespace + +void post(const FunctionCallTrie &T, tid_t TId) { + static const bool UNUSED Once = [] { + SpinMutexLock Lock(&GlobalMutex); + GlobalAllocators = reinterpret_cast( + InternalAlloc(sizeof(FunctionCallTrie::Allocators))); + new (GlobalAllocators) FunctionCallTrie::Allocators(); + *GlobalAllocators = FunctionCallTrie::InitAllocators(); + return false; + }(); + DCHECK_NE(GlobalAllocators, nullptr); + + ThreadTrie *Item = nullptr; + { + SpinMutexLock Lock(&GlobalMutex); + if (GlobalAllocators == nullptr) + return; + + Item = ThreadTries.PushBack(); + Item->TId = TId; + Item->Trie = reinterpret_cast( + InternalAlloc(sizeof(FunctionCallTrie))); + new (Item->Trie) FunctionCallTrie(*GlobalAllocators); + DCHECK_NE(Item->Trie, nullptr); + } + DCHECK_NE(Item, nullptr); + + T.deepCopyInto(*Item->Trie); +} + +using PathArray = Array; + +struct ProfileRecord { + using PathAllocator = typename PathArray::AllocatorType; + + // The Path in this record is in reverse order. + PathArray *Path = nullptr; + const FunctionCallTrie::Node *Node = nullptr; + + // Constructor for in-place construction. + ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N) + : Path([&] { + auto P = + reinterpret_cast(InternalAlloc(sizeof(PathArray))); + new (P) PathArray(A); + return P; + }()), + Node(N) {} +}; + +namespace { + +using ProfileRecordArray = Array; + +// We go through a FunctionCallTrie and traverse from the root, in DFS fashion, +// to generate the path(s) and output the data. +void populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA, + const FunctionCallTrie &Trie) { + for (const auto R : Trie.getRoots()) { + using StackArray = Array; + using StackAllocator = typename StackArray::AllocatorType; + StackAllocator StackAlloc( + profilerFlags()->xray_profiling_stack_allocator_max, 0); + StackArray DFSStack(StackAlloc); + DFSStack.Append(R); + while (!DFSStack.empty()) { + auto Node = DFSStack.back(); + DFSStack.trim(1); + auto Record = PRs.AppendEmplace(PA, Node); + DCHECK_NE(Record, nullptr); + + // Traverse the Node's parents and as we're doing so, get the FIds in + // the order they appear. + for (auto N = Node; N != nullptr; N = N->Parent) + Record->Path->Append(N->FId); + DCHECK(!Record->Path->empty()); + + for (const auto C : Node->Callees) + DFSStack.Append(C.NodePtr); + } + } +} + +void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header, + const ProfileRecordArray &ProfileRecords) { + auto NextPtr = static_cast( + internal_memcpy(Buffer->Data, &Header, sizeof(Header))) + + sizeof(Header); + for (const auto &Record : ProfileRecords) { + // List of IDs follow: + for (const auto FId : *Record.Path) + NextPtr = + static_cast(internal_memcpy(NextPtr, &FId, sizeof(FId))) + + sizeof(FId); + + // Add the sentinel here. 
+ constexpr int32_t SentinelFId = 0; + NextPtr = static_cast( + internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) + + sizeof(SentinelFId); + + // Add the node data here. + NextPtr = + static_cast(internal_memcpy(NextPtr, &Record.Node->CallCount, + sizeof(Record.Node->CallCount))) + + sizeof(Record.Node->CallCount); + NextPtr = static_cast( + internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime, + sizeof(Record.Node->CumulativeLocalTime))) + + sizeof(Record.Node->CumulativeLocalTime); + + // Add an end of record sentinel here. + constexpr uint64_t EndOfRecord = 0x0; + NextPtr = static_cast( + internal_memset(NextPtr, EndOfRecord, sizeof(EndOfRecord))) + + sizeof(EndOfRecord); + } + + DCHECK_EQ(NextPtr - static_cast(Buffer->Data), Buffer->Size); +} + +} // namespace + +void serialize() { + SpinMutexLock Lock(&GlobalMutex); + + // Clear out the global ProfileBuffers. + for (uptr I = 0; I < ProfileBuffers.Size(); ++I) + InternalFree(ProfileBuffers[I].Data); + ProfileBuffers.Reset(); + + if (ThreadTries.Size() == 0) + return; + + // Then repopulate the global ProfileBuffers. + for (u32 I = 0; I < ThreadTries.Size(); ++I) { + using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType; + ProfileRecordAllocator PRAlloc( + profilerFlags()->xray_profiling_global_allocator_max, 0); + ProfileRecord::PathAllocator PathAlloc( + profilerFlags()->xray_profiling_global_allocator_max, 0); + ProfileRecordArray ProfileRecords(PRAlloc); + + // First, we want to compute the amount of space we're going to need. We'll + // use a local allocator and an __xray::Array<...> to store the intermediary + // data, then compute the size as we're going along. Then we'll allocate the + // contiguous space to contain the thread buffer data. + const auto &Trie = *ThreadTries[I].Trie; + if (Trie.getRoots().empty()) + continue; + populateRecords(ProfileRecords, PathAlloc, Trie); + DCHECK(!Trie.getRoots().empty()); + DCHECK(!ProfileRecords.empty()); + + // Go through each record, to compute the sizes. + // + // header size = block size (4 bytes) + // + block number (4 bytes) + // + thread id (8 bytes) + // record size = path ids (4 bytes * number of ids + sentinel 4 bytes) + // + call count (8 bytes) + // + local time (8 bytes) + // + end of record (8 bytes) + u32 CumulativeSizes = 0; + for (const auto &Record : ProfileRecords) + CumulativeSizes += 28 + (4 * Record.Path->size()); + + BlockHeader Header{16 + CumulativeSizes, I, ThreadTries[I].TId}; + auto Buffer = ProfileBuffers.PushBack(); + Buffer->Size = sizeof(Header) + CumulativeSizes; + Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64); + DCHECK_NE(Buffer->Data, nullptr); + serializeRecords(Buffer, Header, ProfileRecords); + + // Now clean up the ProfileRecords array, one at a time. + for (auto &Record : ProfileRecords) { + Record.Path->~PathArray(); + InternalFree(Record.Path); + } + } +} + +void reset() { + SpinMutexLock Lock(&GlobalMutex); + // Clear out the profile buffers that have been serialized. + for (uptr I = 0; I < ProfileBuffers.Size(); ++I) + InternalFree(ProfileBuffers[I].Data); + ProfileBuffers.Reset(); + + // Clear out the function call tries per thread. + for (uptr I = 0; I < ThreadTries.Size(); ++I) { + auto &T = ThreadTries[I]; + T.Trie->~FunctionCallTrie(); + InternalFree(T.Trie); + } + ThreadTries.Reset(); + + // Reset the global allocators. 
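To sanity-check the size computation in serialize() above, a worked example based on the two-node trie built in the PostSerializeCollect unit test: the record paths are [1] and [2, 1], so CumulativeSizes = (28 + 4 * 1) + (28 + 4 * 2) = 68, the header adds 16 bytes, and both BlockHeader::BlockSize and Buffer->Size come out to 84 bytes.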
+ if (GlobalAllocators != nullptr) { + GlobalAllocators->~Allocators(); + InternalFree(GlobalAllocators); + GlobalAllocators = nullptr; + } + GlobalAllocators = reinterpret_cast( + InternalAlloc(sizeof(FunctionCallTrie::Allocators))); + new (GlobalAllocators) FunctionCallTrie::Allocators(); + *GlobalAllocators = FunctionCallTrie::InitAllocators(); +} + +XRayBuffer nextBuffer(XRayBuffer B) { + // FIXME: Find a way to identify the buffers, somehow, from within this + // function. + SpinMutexLock Lock(&GlobalMutex); + if (B.Data == nullptr && ProfileBuffers.Size()) + return {ProfileBuffers[0].Data, ProfileBuffers[0].Size}; + + BlockHeader Header; + internal_memcpy(&Header, B.Data, sizeof(BlockHeader)); + auto NextBlock = Header.BlockNum + 1; + if (NextBlock < ProfileBuffers.Size()) + return {ProfileBuffers[NextBlock].Data, ProfileBuffers[NextBlock].Size}; + return {nullptr, 0}; +} + +} // namespace profileCollectorService +} // namespace __xray Index: compiler-rt/lib/xray/xray_profiler.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_profiler.cc @@ -0,0 +1,296 @@ +//===-- xray_profiling.cc --------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This is the implementation of a profiling handler. +// +//===----------------------------------------------------------------------===// +#include + +#include "sanitizer_common/sanitizer_atomic.h" +#include "sanitizer_common/sanitizer_flags.h" +#include "xray/xray_interface.h" +#include "xray/xray_log_interface.h" + +#include "xray_flags.h" +#include "xray_profile_collector.h" +#include "xray_profiler_flags.h" +#include "xray_tsc.h" +#include "xray_utils.h" +#include + +namespace __xray { + +namespace { + +__sanitizer::atomic_sint32_t ProfilerLogFlushStatus = { + XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; + +__sanitizer::atomic_sint32_t ProfilerLogStatus = { + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; + +__sanitizer::SpinMutex ProfilerOptionsMutex; + +struct alignas(64) ProfilingTLD { + FunctionCallTrie::Allocators *Allocators = nullptr; + FunctionCallTrie *FCT = nullptr; +}; + +static pthread_key_t ProfilingKey; + +ProfilingTLD &getThreadLocalData() XRAY_NEVER_INSTRUMENT { + thread_local ProfilingTLD TLD; + thread_local bool UNUSED Once = [] { + pthread_setspecific(ProfilingKey, &TLD); + return false; + }(); + + // We need to check whether the global flag to finalizing/finalized has been + // switched. If it is, then we ought to not actually initialise the data. + auto Status = __sanitizer::atomic_load(&ProfilerLogStatus, + __sanitizer::memory_order_acquire); + if (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || + Status == XRayLogInitStatus::XRAY_LOG_FINALIZED) + return TLD; + + // If we're live, then we re-initialize TLD if the pointers are not null. 
+ if (UNLIKELY(TLD.Allocators == nullptr && TLD.FCT == nullptr)) { + TLD.Allocators = reinterpret_cast( + InternalAlloc(sizeof(FunctionCallTrie::Allocators))); + new (TLD.Allocators) FunctionCallTrie::Allocators(); + *TLD.Allocators = FunctionCallTrie::InitAllocators(); + TLD.FCT = reinterpret_cast( + InternalAlloc(sizeof(FunctionCallTrie))); + new (TLD.FCT) FunctionCallTrie(*TLD.Allocators); + } + + return TLD; +} + +} // namespace + +const char *profilerCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT { +#ifdef XRAY_PROFILER_DEFAULT_OPTIONS + return SANITIZER_STRINGIFY(XRAY_PROFILER_DEFAULT_OPTIONS); +#else + return ""; +#endif +} + +__sanitizer::atomic_sint32_t ProfileFlushStatus = { + XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; + +XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT { + // When flushing, all we really do is reset the global state, and only when + // the log has already been finalized. + if (__sanitizer::atomic_load(&ProfilerLogStatus, + __sanitizer::memory_order_acquire) != + XRayLogInitStatus::XRAY_LOG_FINALIZED) { + if (__sanitizer::Verbosity()) + Report("Not flushing profiles, profiler not been finalized.\n"); + return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; + } + + s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; + if (!__sanitizer::atomic_compare_exchange_strong( + &ProfilerLogFlushStatus, &Result, + XRayLogFlushStatus::XRAY_LOG_FLUSHING, + __sanitizer::memory_order_release)) { + if (__sanitizer::Verbosity()) + Report("Not flushing profiles, implementation still finalizing.\n"); + } + + profileCollectorService::reset(); + + __sanitizer::atomic_store(&ProfilerLogStatus, + XRayLogFlushStatus::XRAY_LOG_FLUSHED, + __sanitizer::memory_order_release); + + return XRayLogFlushStatus::XRAY_LOG_FLUSHED; +} + +namespace { + +thread_local volatile bool ReentranceGuard = false; + +void postCurrentThreadFCT(ProfilingTLD &TLD) { + if (TLD.Allocators == nullptr || TLD.FCT == nullptr) + return; + + profileCollectorService::post(*TLD.FCT, GetTid()); + TLD.FCT->~FunctionCallTrie(); + TLD.Allocators->~Allocators(); + InternalFree(TLD.FCT); + InternalFree(TLD.Allocators); + TLD.FCT = nullptr; + TLD.Allocators = nullptr; +} + +} // namespace + +void profilingHandleArg0(int32_t FuncId, + XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { + unsigned char CPU; + auto TSC = readTSC(CPU); + if (ReentranceGuard) + return; + ReentranceGuard = true; + + auto Status = __sanitizer::atomic_load(&ProfilerLogStatus, + __sanitizer::memory_order_acquire); + auto &TLD = getThreadLocalData(); + if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_FINALIZED || + Status == XRayLogInitStatus::XRAY_LOG_FINALIZING)) { + postCurrentThreadFCT(TLD); + ReentranceGuard = false; + return; + } + + switch (Entry) { + case XRayEntryType::ENTRY: + case XRayEntryType::LOG_ARGS_ENTRY: + TLD.FCT->enterFunction(FuncId, TSC); + break; + case XRayEntryType::EXIT: + case XRayEntryType::TAIL: + TLD.FCT->exitFunction(FuncId, TSC); + break; + default: + // FIXME: Handle bugs. 
+    break;
+  }
+
+  ReentranceGuard = false;
+}
+
+void profilingHandleArg1(int32_t FuncId, XRayEntryType Entry,
+                         uint64_t) XRAY_NEVER_INSTRUMENT {
+  return profilingHandleArg0(FuncId, Entry);
+}
+
+XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT {
+  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_INITIALIZED;
+  if (!__sanitizer::atomic_compare_exchange_strong(
+          &ProfilerLogStatus, &CurrentStatus,
+          XRayLogInitStatus::XRAY_LOG_FINALIZING,
+          __sanitizer::memory_order_release)) {
+    if (__sanitizer::Verbosity())
+      Report("Cannot finalize profile, the profiler is not initialized.\n");
+    return static_cast<XRayLogInitStatus>(CurrentStatus);
+  }
+
+  // Wait a grace period to allow threads to see that we're finalizing.
+  __sanitizer::SleepForMillis(profilerFlags()->xray_profiling_grace_period_ms);
+
+  // We also want to make sure that the current thread's data is cleaned up, if
+  // we have any.
+  auto &TLD = getThreadLocalData();
+  postCurrentThreadFCT(TLD);
+
+  // Then we force serialize the log data.
+  profileCollectorService::serialize();
+
+  __sanitizer::atomic_store(&ProfilerLogStatus,
+                            XRayLogInitStatus::XRAY_LOG_FINALIZED,
+                            __sanitizer::memory_order_release);
+  return XRayLogInitStatus::XRAY_LOG_FINALIZED;
+}
+
+XRayLogInitStatus
+profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options,
+                     size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
+  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  if (!__sanitizer::atomic_compare_exchange_strong(
+          &ProfilerLogStatus, &CurrentStatus,
+          XRayLogInitStatus::XRAY_LOG_INITIALIZING,
+          __sanitizer::memory_order_release)) {
+    if (__sanitizer::Verbosity())
+      Report(
+          "Cannot initialize an already initialized profiling implementation.\n");
+    return static_cast<XRayLogInitStatus>(CurrentStatus);
+  }
+
+  // Here we use the sanitizer flag parsing mechanism. When PR36790 is fixed,
+  // migrate to using a different API for configuration.
+  {
+    SpinMutexLock Lock(&ProfilerOptionsMutex);
+    FlagParser ConfigParser;
+    auto *F = profilerFlags();
+    F->setDefaults();
+    registerProfilerFlags(&ConfigParser, F);
+    const char *ProfilerCompileFlags = profilerCompilerDefinedFlags();
+    ConfigParser.ParseString(ProfilerCompileFlags);
+    ConfigParser.ParseString(static_cast<const char *>(Options));
+    if (Verbosity())
+      ReportUnrecognizedFlags();
+  }
+
+  // We need to reset the profile data collection implementation now.
+  profileCollectorService::reset();
+
+  // We need to set up the at-thread-exit handler.
+  static bool UNUSED Once = [] {
+    pthread_key_create(&ProfilingKey, +[](void *) {
+      // This is the thread-exit handler.
+      auto &TLD = getThreadLocalData();
+      if (TLD.Allocators == nullptr && TLD.FCT == nullptr)
+        return;
+
+      postCurrentThreadFCT(TLD);
+    });
+    return false;
+  }();
+
+  __xray_log_set_buffer_iterator(profileCollectorService::nextBuffer);
+  __xray_set_handler(profilingHandleArg0);
+  __xray_set_handler_arg1(profilingHandleArg1);
+
+  __sanitizer::atomic_store(&ProfilerLogStatus,
+                            XRayLogInitStatus::XRAY_LOG_INITIALIZED,
+                            __sanitizer::memory_order_release);
+  if (__sanitizer::Verbosity())
+    Report("XRay Profiling init successful.\n");
+
+  return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
+}
+
+bool profilingDynamicInitializer() XRAY_NEVER_INSTRUMENT {
+  // Set up the flag defaults from the static defaults and the
+  // compiler-provided defaults.
+ { + SpinMutexLock Lock(&ProfilerOptionsMutex); + auto *F = profilerFlags(); + F->setDefaults(); + FlagParser ProfilingParser; + registerProfilerFlags(&ProfilingParser, F); + const char *ProfilerCompileFlags = profilerCompilerDefinedFlags(); + ProfilingParser.ParseString(ProfilerCompileFlags); + } + + XRayLogImpl Impl{ + profilingLoggingInit, + profilingFinalize, + profilingHandleArg0, + profilingFlush, + }; + auto RegistrationResult = __xray_log_register_mode("xray-profiling", Impl); + if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK && + __sanitizer::Verbosity()) + Report( + "Cannot register XRay Profiling mode to 'xray-profiling'; error = %d\n", + RegistrationResult); + if (!__sanitizer::internal_strcmp(flags()->xray_mode, "xray-profiling")) + __xray_set_log_impl(Impl); + return true; +} + +} // namespace __xray + +static auto UNUSED Unused = __xray::profilingDynamicInitializer(); Index: compiler-rt/lib/xray/xray_profiler_flags.h =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_profiler_flags.h @@ -0,0 +1,39 @@ +//===-- xray_profiler_flags.h ----------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay profiler runtime flags. +//===----------------------------------------------------------------------===// + +#ifndef XRAY_PROFILER_FLAGS_H +#define XRAY_PROFILER_FLAGS_H + +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_internal_defs.h" + +namespace __xray { + +struct ProfilerFlags { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name; +#include "xray_profiler_flags.inc" +#undef XRAY_FLAG + + void setDefaults(); +}; + +extern ProfilerFlags xray_profiler_flags_dont_use_directly; +inline ProfilerFlags *profilerFlags() { + return &xray_profiler_flags_dont_use_directly; +} +void registerProfilerFlags(FlagParser *P, ProfilerFlags *F); + +} // namespace __xray + +#endif // XRAY_PROFILER_FLAGS_H Index: compiler-rt/lib/xray/xray_profiler_flags.cc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_profiler_flags.cc @@ -0,0 +1,40 @@ +//===-- xray_flags.h -------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay runtime flags. +//===----------------------------------------------------------------------===// + +#include "xray_profiler_flags.h" +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_libc.h" +#include "xray_defs.h" + +namespace __xray { + +// Storage for the profiler flags. 
+ProfilerFlags xray_profiler_flags_dont_use_directly; + +void ProfilerFlags::setDefaults() XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue; +#include "xray_profiler_flags.inc" +#undef XRAY_FLAG +} + +void registerProfilerFlags(FlagParser *P, + ProfilerFlags *F) XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) \ + RegisterFlag(P, #Name, Description, &F->Name); +#include "xray_profiler_flags.inc" +#undef XRAY_FLAG +} + +} // namespace __xray Index: compiler-rt/lib/xray/xray_profiler_flags.inc =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_profiler_flags.inc @@ -0,0 +1,26 @@ +//===-- xray_flags.inc ------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// XRay profiling runtime flags. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_FLAG +#error "Define XRAY_FLAG prior to including this file!" +#endif + +XRAY_FLAG(uptr, xray_profiling_per_thread_allocator_max, 2 << 20, + "Maximum size of any single per-thread allocator.") +XRAY_FLAG(uptr, xray_profiling_global_allocator_max, 2 << 24, + "Maximum size of the global allocator for profile storage.") +XRAY_FLAG(uptr, xray_profiling_stack_allocator_max, 2 << 24, + "Maximum size of the traversal stack allocator.") +XRAY_FLAG(int, xray_profiling_grace_period_ms, 100, + "Profile collection will wait this much time in milliseconds before " + "resetting the global state. This gives a chance to threads to " + "notice that the profiler has been finalized and clean up.") Index: compiler-rt/lib/xray/xray_segmented_array.h =================================================================== --- /dev/null +++ compiler-rt/lib/xray/xray_segmented_array.h @@ -0,0 +1,292 @@ +//===-- xray_segmented_array.h ---------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Defines the implementation of a segmented array, with fixed-size chunks +// backing the segments. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_SEGMENTED_ARRAY_H +#define XRAY_SEGMENTED_ARRAY_H + +#include "sanitizer_common/sanitizer_allocator.h" +#include "xray_allocator.h" +#include +#include + +namespace __xray { + +/// The Array type provides an interface similar to std::vector<...> but does +/// not shrink in size. Once constructed, elements can be appended but cannot be +/// removed. The implementation is heavily dependent on the contract provided by +/// the Allocator type, in that all memory will be released when the Allocator +/// is destroyed. When an Array is destroyed, it will destroy elements in the +/// backing store but will not free the memory. The parameter N defines how many +/// elements of T there should be in a single block. By default we compute this +/// number from packing as many T's as possible in a single cache line. 
+template +struct Array { + static constexpr size_t AllocatorChunkSize = sizeof(T) * N; + using AllocatorType = Allocator; + static_assert(std::is_trivially_destructible::value, + "T must be trivially destructible."); + +private: + struct Chunk { + typename AllocatorType::Block Block; + static constexpr size_t Size = N; + Chunk *Prev = nullptr; + Chunk *Next = nullptr; + }; + + static Chunk SentinelChunk; + + AllocatorType *Allocator; + Chunk *Head = &SentinelChunk; + Chunk *Tail = &SentinelChunk; + size_t Size = 0; + + Chunk *NewChunk() { + auto Block = Allocator->Allocate(); + if (Block.Data == nullptr) + return nullptr; + // TODO: Maybe use a separate managed allocator for Chunk instances? + auto C = reinterpret_cast(InternalAlloc(sizeof(Chunk))); + if (C == nullptr) + return nullptr; + C->Block = Block; + return C; + } + + static AllocatorType &GetGlobalAllocator() { + static AllocatorType *const GlobalAllocator = [] { + AllocatorType *A = reinterpret_cast( + InternalAlloc(sizeof(AllocatorType))); + new (A) AllocatorType(2 << 10, 0); + return A; + }(); + + return *GlobalAllocator; + } + + Chunk *InitHeadAndTail() { + DCHECK_EQ(Head, &SentinelChunk); + DCHECK_EQ(Tail, &SentinelChunk); + auto Chunk = NewChunk(); + if (Chunk == nullptr) + return nullptr; + Chunk->Prev = &SentinelChunk; + Chunk->Next = &SentinelChunk; + Head = Chunk; + Tail = Chunk; + return Chunk; + } + + Chunk *AppendNewChunk() { + auto Chunk = NewChunk(); + if (Chunk == nullptr) + return nullptr; + Tail->Next = Chunk; + Chunk->Prev = Tail; + Chunk->Next = &SentinelChunk; + Tail = Chunk; + return Chunk; + } + + class Iterator { + Chunk *C = nullptr; + size_t Offset = 0; + + public: + Iterator(Chunk *IC, size_t O) : C(IC), Offset(O) {} + + Iterator &operator++() { + DCHECK_NE(C, nullptr); + if (++Offset % N) + return *this; + + // At this point, we know that Offset % N == 0, so we must advance the + // chunk pointer. + DCHECK_EQ(Offset % N, 0); + C = C->Next; + return *this; + } + + Iterator &operator--() { + DCHECK_NE(C, nullptr); + if (--Offset % N) { + return *this; + } + + // At this point, we know that Offset % N == 0, so we must retreat the + // chunk pointer. 
+ DCHECK_EQ(Offset % N, 0); + C = C->Prev; + return *this; + } + + Iterator operator++(int) { + Iterator Copy(*this); + if (++Offset % N == 0) + C = C->Next; + DCHECK_NE(Copy.Offset, this->Offset); + return Copy; + } + + Iterator operator--(int) { + Iterator Copy(*this); + if (--Offset % N == 0) + C = C->Prev; + DCHECK_NE(Copy.Offset, this->Offset); + return Copy; + } + + friend bool operator==(const Iterator &L, const Iterator &R) { + return L.C == R.C && L.Offset == R.Offset; + } + + friend bool operator!=(const Iterator &L, const Iterator &R) { + return !(L == R); + } + + const T &operator*() const { + DCHECK_NE(C, &SentinelChunk); + return reinterpret_cast(C->Block.Data)[Offset % N]; + } + + T &operator*() { + DCHECK_NE(C, &SentinelChunk); + return reinterpret_cast(C->Block.Data)[Offset % N]; + } + }; + +public: + explicit Array(AllocatorType &A) : Allocator(&A) {} + Array() : Array(GetGlobalAllocator()) {} + + Array(const Array &) = delete; + Array(Array &&O) NOEXCEPT : Allocator(O.Allocator), + Head(O.Head), + Tail(O.Tail), + Size(O.Size) { + O.Head = &SentinelChunk; + O.Tail = &SentinelChunk; + O.Size = 0; + } + + bool empty() const { return Size == 0; } + + AllocatorType &allocator() const { + DCHECK_NE(Allocator, nullptr); + return *Allocator; + } + + size_t size() const { return Size; } + + T *Append(const T &E) { + if (UNLIKELY(Head == &SentinelChunk)) + if (InitHeadAndTail() == nullptr) + return nullptr; + + auto Offset = Size % N; + if (UNLIKELY(Size != 0 && Offset == 0)) + if (AppendNewChunk() == nullptr) + return nullptr; + + auto Position = reinterpret_cast(Tail->Block.Data) + Offset; + *Position = E; + ++Size; + return Position; + } + + template T *AppendEmplace(Args &&... args) { + if (UNLIKELY(Head == &SentinelChunk)) + if (InitHeadAndTail() == nullptr) + return nullptr; + + auto Offset = Size % N; + if (UNLIKELY(Size != 0 && Offset == 0)) + if (AppendNewChunk() == nullptr) + return nullptr; + + auto Position = reinterpret_cast(Tail->Block.Data) + Offset; + // In-place construct at Position. + new (Position) T(std::forward(args)...); + ++Size; + return Position; + } + + T &operator[](size_t Offset) const { + DCHECK_LE(Offset, Size); + // We need to traverse the array enough times to find the element at Offset. + auto C = Head; + while (Offset >= N) { + C = C->Next; + Offset -= N; + DCHECK_NE(C, &SentinelChunk); + } + auto Position = reinterpret_cast(C->Block.Data) + Offset; + return *Position; + } + + T &front() const { + DCHECK_NE(Head, &SentinelChunk); + DCHECK_NE(Size, 0u); + return *reinterpret_cast(Head->Block.Data); + } + + T &back() const { + DCHECK_NE(Tail, &SentinelChunk); + auto Offset = (Size - 1) % N; + return *(reinterpret_cast(Tail->Block.Data) + Offset); + } + + template T *find_element(Predicate P) const { + if (empty()) + return nullptr; + + auto C = Head; + auto E = Size; + while (E != 0) { + DCHECK(C != &SentinelChunk); + auto LE = N < E ? N : E; + auto end = reinterpret_cast(C->Block.Data) + LE; + for (auto It = reinterpret_cast(C->Block.Data); It != end; ++It) + if (P(*It)) + return It; + C = C->Next; + E -= LE; + } + return nullptr; + } + + /// Remove N Elements from the end. + void trim(size_t Elements) { + // TODO: Figure out whether we need to actually destroy the objects that are + // going to be trimmed -- determine whether assuming that trivially + // destructible objects are OK to discard, from the XRay implementation. + DCHECK_LE(Elements, Size); + Size -= Elements; + } + + // Provide iterators. 
+  Iterator begin() const { return Iterator(Head, 0); }
+  Iterator end() const { return Iterator(Tail, Size); }
+  Iterator cbegin() const { return Iterator(Head, 0); }
+  Iterator cend() const { return Iterator(Tail, Size); }
+};
+
+template <class T, size_t N>
+typename Array<T, N>::Chunk Array<T, N>::SentinelChunk;
+
+} // namespace __xray
+
+#endif // XRAY_SEGMENTED_ARRAY_H
Index: compiler-rt/test/xray/TestCases/Posix/profiling-multi-threaded.cc
===================================================================
--- /dev/null
+++ compiler-rt/test/xray/TestCases/Posix/profiling-multi-threaded.cc
@@ -0,0 +1,53 @@
+// Check that we can get a profile from a multi-threaded application, on
+// demand through the XRay logging implementation API.
+//
+// FIXME: Make -fxray-modes=xray-profiling part of the default?
+// RUN: %clangxx_xray -std=c++11 %s -o %t -fxray-modes=xray-profiler
+// RUN: %run %t
+//
+// UNSUPPORTED: target-is-mips64,target-is-mips64el
+
+#include "xray/xray_interface.h"
+#include "xray/xray_log_interface.h"
+#include <cassert>
+#include <cstdio>
+#include <string>
+#include <thread>
+
+#define XRAY_ALWAYS_INSTRUMENT [[clang::xray_always_instrument]]
+#define XRAY_NEVER_INSTRUMENT [[clang::xray_never_instrument]]
+
+XRAY_ALWAYS_INSTRUMENT void f2() { return; }
+XRAY_ALWAYS_INSTRUMENT void f1() { f2(); }
+XRAY_ALWAYS_INSTRUMENT void f0() { f1(); }
+
+using namespace std;
+
+volatile int buffer_counter = 0;
+
+XRAY_NEVER_INSTRUMENT void process_buffer(const char *, XRayBuffer) {
+  // FIXME: Actually assert the contents of the buffer.
+  ++buffer_counter;
+}
+
+XRAY_ALWAYS_INSTRUMENT int main(int, char **) {
+  assert(__xray_log_select_mode("xray-profiling") ==
+         XRayLogRegisterStatus::XRAY_REGISTRATION_OK);
+  assert(__xray_log_get_current_mode() != nullptr);
+  std::string current_mode = __xray_log_get_current_mode();
+  assert(current_mode == "xray-profiling");
+  assert(__xray_patch() == XRayPatchingStatus::SUCCESS);
+  assert(__xray_log_init(0, 0, nullptr, 0) ==
+         XRayLogInitStatus::XRAY_LOG_INITIALIZED);
+  std::thread t0([] { f0(); });
+  std::thread t1([] { f0(); });
+  f0();
+  t0.join();
+  t1.join();
+  assert(__xray_log_finalize() == XRayLogInitStatus::XRAY_LOG_FINALIZED);
+  assert(__xray_log_process_buffers(process_buffer) ==
+         XRayLogFlushStatus::XRAY_LOG_FLUSHED);
+  // We're running three threads, so we expect three buffers.
+  assert(buffer_counter == 3);
+  assert(__xray_log_flushLog() == XRayLogFlushStatus::XRAY_LOG_FLUSHED);
+}
Index: compiler-rt/test/xray/TestCases/Posix/profiling-single-threaded.cc
===================================================================
--- /dev/null
+++ compiler-rt/test/xray/TestCases/Posix/profiling-single-threaded.cc
@@ -0,0 +1,48 @@
+// Check that we can get a profile from a single-threaded application, on
+// demand through the XRay logging implementation API.
+//
+// FIXME: Make -fxray-modes=xray-profiling part of the default?
+// RUN: %clangxx_xray -std=c++11 %s -o %t -fxray-modes=xray-profiler
+// RUN: %run %t
+//
+// UNSUPPORTED: target-is-mips64,target-is-mips64el
+
+#include "xray/xray_interface.h"
+#include "xray/xray_log_interface.h"
+#include <cassert>
+#include <cstdio>
+#include <string>
+
+#define ALWAYS_INSTRUMENT [[clang::xray_always_instrument]]
+#define NEVER_INSTRUMENT [[clang::xray_never_instrument]]
+
+ALWAYS_INSTRUMENT void f2() { return; }
+ALWAYS_INSTRUMENT void f1() { f2(); }
+ALWAYS_INSTRUMENT void f0() { f1(); }
+
+using namespace std;
+
+volatile int buffer_counter = 0;
+
+NEVER_INSTRUMENT void process_buffer(const char *, XRayBuffer) {
+  // FIXME: Actually assert the contents of the buffer.
+  ++buffer_counter;
+}
+
+ALWAYS_INSTRUMENT int main(int, char **) {
+  assert(__xray_log_select_mode("xray-profiling") ==
+         XRayLogRegisterStatus::XRAY_REGISTRATION_OK);
+  assert(__xray_log_get_current_mode() != nullptr);
+  std::string current_mode = __xray_log_get_current_mode();
+  assert(current_mode == "xray-profiling");
+  assert(__xray_patch() == XRayPatchingStatus::SUCCESS);
+  assert(__xray_log_init(0, 0, nullptr, 0) ==
+         XRayLogInitStatus::XRAY_LOG_INITIALIZED);
+  f0();
+  assert(__xray_log_finalize() == XRayLogInitStatus::XRAY_LOG_FINALIZED);
+  f0();
+  assert(__xray_log_process_buffers(process_buffer) ==
+         XRayLogFlushStatus::XRAY_LOG_FLUSHED);
+  assert(buffer_counter == 1);
+  assert(__xray_log_flushLog() == XRayLogFlushStatus::XRAY_LOG_FLUSHED);
+}
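
For reference, a minimal sketch (not part of the patch) of how a unit test in the style of segmented_array_test.cc might exercise the Array<T, N> container added in xray_segmented_array.h above. It assumes only the interface shown there (Append, AppendEmplace, size, front/back, begin/end, find_element, trim); the Element type, the chunk size of 4, and the test name are hypothetical.

// A minimal sketch, assuming the Array<T, N> interface declared in
// xray_segmented_array.h above; Element and the test name are illustrative.
#include "xray_segmented_array.h"
#include "gtest/gtest.h"
#include <cstdint>

namespace __xray {
namespace {

struct Element {
  int64_t Id;
  int64_t Value;
  Element(int64_t I, int64_t V) : Id(I), Value(V) {}
};

TEST(SegmentedArraySketch, AppendAndIterate) {
  // Explicit chunk size of 4 elements; backed by the global allocator.
  Array<Element, 4> A;
  ASSERT_NE(A.AppendEmplace(1, 10), nullptr); // In-place construction.
  Element E(2, 20);
  ASSERT_NE(A.Append(E), nullptr);            // Copy-append.
  ASSERT_EQ(A.size(), 2u);
  ASSERT_EQ(A.front().Id, 1);
  ASSERT_EQ(A.back().Value, 20);

  // Linear search across chunks via find_element.
  auto *Found = A.find_element([](const Element &X) { return X.Id == 2; });
  ASSERT_NE(Found, nullptr);
  ASSERT_EQ(Found->Value, 20);

  // Range-based iteration uses begin()/end() and the Iterator operators.
  int64_t Sum = 0;
  for (const auto &X : A)
    Sum += X.Value;
  ASSERT_EQ(Sum, 30);

  // trim drops elements from the end without freeing backing memory.
  A.trim(1);
  ASSERT_EQ(A.size(), 1u);
}

} // namespace
} // namespace __xray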