Index: lib/xray/CMakeLists.txt
===================================================================
--- lib/xray/CMakeLists.txt
+++ lib/xray/CMakeLists.txt
@@ -2,7 +2,8 @@
 
 set(XRAY_SOURCES
   xray_init.cc
-	xray_interface.cc
+  xray_interface.cc
+  xray_inmemory_log.cc
 )
 
 include_directories(..)
@@ -12,9 +13,9 @@
 set(XRAY_COMMON_DEFINITIONS XRAY_HAS_EXCEPTIONS=1)
 
 add_compiler_rt_object_libraries(RTXray
-		ARCHS ${XRAY_SUPPORTED_ARCH}
-		SOURCES ${XRAY_SOURCES} CFLAGS ${XRAY_CFLAGS}
-		DEFS ${XRAY_COMMON_DEFINITIONS})
+    ARCHS ${XRAY_SUPPORTED_ARCH}
+    SOURCES ${XRAY_SOURCES} CFLAGS ${XRAY_CFLAGS}
+    DEFS ${XRAY_COMMON_DEFINITIONS})
 
 add_custom_target(xray)
 set(XRAY_COMMON_RUNTIME_OBJECT_LIBS RTXray)
Index: lib/xray/xray_init.cc
===================================================================
--- lib/xray/xray_init.cc
+++ lib/xray/xray_init.cc
@@ -70,19 +70,6 @@
   }
 }
 
-extern "C" {
-void __xray_DemoLog(int32_t FuncId, unsigned short Type) {
-  uint64_t Hi;
-  uint32_t Lo, CPUId;
-  __asm__ __volatile__("rdtscp" : "=a"(Lo), "=d"(Hi), "=c"(CPUId));
-  int ignored = printf(
-      "%lu: [%lu] %s%d\n", CPUId, (Hi << 32) | Lo,
-      Type == static_cast<unsigned short>(__xray::EntryType::ENTRY) ? "E" : "X",
-      FuncId);
-  (void)(ignored);
-}
-}
-
 // __xray_init() will do the actual loading of the current process' memory map
 // and then proceed to look for the .xray_instr_map section/segment.
 void __xray_init() {
@@ -101,9 +88,8 @@
   XRayInstrMap.store(SledMap, std::memory_order_release);
   XRayInitialized.store(true, std::memory_order_release);
 
-  // FIXME: Only for demo, patch the functions before we run main.
+  // FIXME: Only for demo, dump the table and patch functions before main.
   __xray_dump();
-  __xray_set_handler(__xray_DemoLog);
   __xray_patch();
 }
 
Index: lib/xray/xray_inmemory_log.cc
===================================================================
--- /dev/null
+++ lib/xray/xray_inmemory_log.cc
@@ -0,0 +1,134 @@
+//===-- xray_inmemory_log.cc ------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Implementation of a simple in-memory log of XRay events. This defines a
+// logging function that's compatible with the XRay handler interface, and
+// routines for exporting data to files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "xray_interface_internal.h"
+
+#include <mutex>
+#include <thread>
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <x86intrin.h>
+
+// Declared with C linkage; the definition below inherits it from here.
+extern "C" void __xray_InMemoryRawLog(int32_t FuncId,
+                                      unsigned short Type);
+
+// __xray_InMemoryRawLog uses a thread-local aligned buffer of fixed size (32kb)
+// that is written to the log file and reused from the start whenever it fills
+// up. We store simple fixed-size entries in the log for external analysis.
+namespace __xray {
+// One fixed-size log entry; alignas(32) pads it to 32 bytes (asserted below).
+struct alignas(32) XRayRecord {
+  uint64_t TSC = 0;   // Full 64-bit TSC value read when the record is taken.
+  pid_t TId = 0;      // ID of the thread that produced the record.
+  uint8_t CPU = 0;    // CPU the thread ran on; assumes CPU numbers fit 8 bits.
+  uint8_t Type = 0;   // Event type (entry or exit), narrowed to 8 bits.
+  int32_t FuncId = 0; // Function ID for the record.
+};
+
+static_assert(sizeof(XRayRecord) == 32, "XRayRecord != 32 bytes");
+
+// Serializes writes to the shared log file descriptor across threads.
+std::mutex LogMutex;
+
+// Number of records in each per-thread buffer (1024 * 32 bytes = 32kb).
+static constexpr size_t BuffLen = 1024;
+
+// Writes the records still buffered by a thread to Fd when it is destroyed.
+class ThreadExitFlusher {
+public:
+  // Offset is held by reference so the destructor sees the final record count.
+  explicit ThreadExitFlusher(int Fd, XRayRecord *Start, size_t &Offset)
+      : Fd(Fd), Start(Start), Offset(Offset) {}
+
+  ~ThreadExitFlusher() noexcept {
+    std::lock_guard<std::mutex> L(LogMutex);
+    auto *Next = reinterpret_cast<const char *>(Start);
+    size_t Left = sizeof(XRayRecord) * Offset;
+    // write() may be short; resume after the bytes already written. Best
+    // effort only: stop on error or a zero-length write.
+    for (ssize_t N; Left > 0 && (N = write(Fd, Next, Left)) > 0; Left -= N)
+      Next += N;
+    fsync(Fd);
+  }
+private:
+  int Fd;
+  XRayRecord *Start;
+  size_t &Offset;
+};
+} // namespace __xray
+
+// Records one event in a thread-local buffer; writes the buffer out when full.
+void __xray_InMemoryRawLog(int32_t FuncId, unsigned short Type) {
+  using Buffer = std::aligned_storage<sizeof(__xray::XRayRecord),
+                                      alignof(__xray::XRayRecord)>::type;
+  thread_local static Buffer InMemoryBuffer[__xray::BuffLen] = {};
+  thread_local static size_t Offset = 0;
+  static int Fd = [] {
+    // Open a temporary file once for the log.
+    static char TmpFilename[] = "/tmp/xray-log-XXXXXX";
+    int Fd = mkstemp(TmpFilename);
+    if (Fd == -1) {
+      printf("Failed opening temporary file '%s'; not logging events.\n",
+             TmpFilename);
+      return -1;
+    }
+    printf("XRay: Log file in '%s'\n", TmpFilename);
+    return Fd;
+  }();
+  if (Fd == -1)
+    return;
+  thread_local __xray::ThreadExitFlusher Flusher(
+      Fd, reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer), Offset);
+  thread_local pid_t TId = syscall(SYS_gettid);
+
+  // Fill in the next free record in place in the already aligned buffer.
+  auto &R = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer)[Offset];
+  uint32_t CPU;
+  R.TSC = __rdtscp(&CPU);
+  R.CPU = CPU;
+  R.TId = TId;
+  R.Type = Type;
+  R.FuncId = FuncId;
+  if (++Offset < __xray::BuffLen)
+    return;
+  // The buffer is full, so write it out. write() may be short: resume after the
+  // bytes already written rather than re-sending the buffer from its start.
+  std::lock_guard<std::mutex> L(__xray::LogMutex);
+  auto *Next = reinterpret_cast<const char *>(InMemoryBuffer);
+  size_t Left = __xray::BuffLen * sizeof(__xray::XRayRecord);
+  while (Left > 0) {
+    ssize_t Written = write(Fd, Next, Left);
+    if (Written == -1) // FIXME: Failed writing; the buffered records are lost.
+      printf("Failed to write; errno = %d\n", errno);
+    if (Written <= 0)
+      break;
+    Next += Written;
+    Left -= Written;
+  }
+  Offset = 0;
+}
+
+// Registers the in-memory logger as the XRay handler during static init.
+static bool Unused = [] {
+  return (__xray_set_handler(__xray_InMemoryRawLog), true);
+}();