Index: compiler-rt/trunk/cmake/config-ix.cmake
===================================================================
--- compiler-rt/trunk/cmake/config-ix.cmake
+++ compiler-rt/trunk/cmake/config-ix.cmake
@@ -161,7 +161,7 @@
 set(ALL_CFI_SUPPORTED_ARCH ${X86} ${X86_64} ${MIPS64})
 set(ALL_ESAN_SUPPORTED_ARCH ${X86_64} ${MIPS64})
 set(ALL_SCUDO_SUPPORTED_ARCH ${X86_64})
-set(ALL_XRAY_SUPPORTED_ARCH ${X86_64})
+set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32})
 
 if(APPLE)
   include(CompilerRTDarwinUtils)
Index: compiler-rt/trunk/lib/sanitizer_common/scripts/gen_dynamic_list.py
===================================================================
--- compiler-rt/trunk/lib/sanitizer_common/scripts/gen_dynamic_list.py
+++ compiler-rt/trunk/lib/sanitizer_common/scripts/gen_dynamic_list.py
@@ -19,6 +19,7 @@
 import re
 import subprocess
 import sys
+import platform
 
 new_delete = set([
                   '_Znam', '_ZnamRKSt9nothrow_t',    # operator new[](unsigned long)
@@ -50,7 +51,7 @@
     raise subprocess.CalledProcessError(nm_proc.returncode, nm)
   func_symbols = ['T', 'W']
   # On PowerPC, nm prints function descriptors from .data section.
-  if os.uname()[4] in ["powerpc", "ppc64"]:
+  if platform.uname()[4] in ["powerpc", "ppc64"]:
     func_symbols += ['D']
   for line in nm_out:
     cols = line.split(' ')
Index: compiler-rt/trunk/lib/xray/CMakeLists.txt
===================================================================
--- compiler-rt/trunk/lib/xray/CMakeLists.txt
+++ compiler-rt/trunk/lib/xray/CMakeLists.txt
@@ -8,9 +8,17 @@
 )
 
 set(x86_64_SOURCES
+		xray_x86_64.cc
 		xray_trampoline_x86_64.S
 		${XRAY_SOURCES})
 
+set(arm_SOURCES
+		xray_arm.cc
+		xray_trampoline_arm.S
+		${XRAY_SOURCES})
+
+set(armhf_SOURCES ${arm_SOURCES})
+
 include_directories(..)
 include_directories(../../include)
 
Index: compiler-rt/trunk/lib/xray/xray_arm.cc
===================================================================
--- compiler-rt/trunk/lib/xray/xray_arm.cc
+++ compiler-rt/trunk/lib/xray/xray_arm.cc
@@ -0,0 +1,131 @@
+//===-- xray_arm.cc ---------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Implementation of ARM-specific routines (32-bit).
+//
+//===----------------------------------------------------------------------===//
+#include "xray_interface_internal.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include <atomic>
+#include <cassert>
+
+namespace __xray {
+
+// The machine codes for some instructions used in runtime patching.
+enum class PatchOpcodes : uint32_t
+{
+  PO_PushR0Lr = 0xE92D4001, // PUSH {r0, lr}
+  PO_BlxIp = 0xE12FFF3C, // BLX ip
+  PO_PopR0Lr = 0xE8BD4001, // POP {r0, lr}
+  PO_B20 = 0xEA000005 // B #20
+};
+
+// Maps the low half-word to MOVW/MOVT immediate bits: 0xUUUUWXYZ -> 0x000W0XYZ
+inline static uint32_t getMovwMask(const uint32_t Value) {
+  return (Value & 0xfff) | ((Value & 0xf000) << 4);
+}
+
+// Maps the high half-word via getMovwMask(): 0xWXYZUUUU -> 0x000W0XYZ
+inline static uint32_t getMovtMask(const uint32_t Value) {
+  return getMovwMask(Value >> 16);
+}
+
+// Writes two instructions at |Address| and returns the address after them:
+//   MOVW R<regNo>, #<lower 16 bits of the |Value|>
+//   MOVT R<regNo>, #<higher 16 bits of the |Value|>
+inline static uint32_t* write32bitLoadReg(uint8_t regNo, uint32_t* Address,
+    const uint32_t Value) {
+  // This is a fatal error: we cannot just report it and continue execution.
+  assert(regNo <= 15 && "Register number must be 0 to 15.");
+  // MOVW R, #0xWXYZ in machine code is 0xE30WRXYZ
+  *Address = (0xE3000000 | (uint32_t(regNo)<<12) | getMovwMask(Value));
+  Address++;
+  // MOVT R, #0xWXYZ in machine code is 0xE34WRXYZ
+  *Address = (0xE3400000 | (uint32_t(regNo)<<12) | getMovtMask(Value));
+  return Address + 1;
+}
+
+// Writes the following instructions:
+//   MOVW r0, #<lower 16 bits of the |Value|>
+//   MOVT r0, #<higher 16 bits of the |Value|>
+inline static uint32_t *Write32bitLoadR0(uint32_t *Address,
+                                         const uint32_t Value) {
+  return write32bitLoadReg(0, Address, Value);
+}
+
+// Writes the following instructions:
+//   MOVW ip, #<lower 16 bits of the |Value|>
+//   MOVT ip, #<higher 16 bits of the |Value|>
+inline static uint32_t *Write32bitLoadIP(uint32_t *Address,
+                                         const uint32_t Value) {
+  return write32bitLoadReg(12, Address, Value);
+}
+
+inline static bool patchSled(const bool Enable, const uint32_t FuncId,
+                             const XRaySledEntry &Sled, void (*TracingHook)()) {
+  // When |Enable| == true, the following compile-time stub (sled):
+  //
+  // xray_sled_n:
+  //   B #20
+  //   6 NOPs (24 bytes)
+  //
+  // is replaced with the following runtime patch:
+  //
+  // xray_sled_n:
+  //   PUSH {r0, lr}
+  //   MOVW r0, #<lower 16 bits of function ID>
+  //   MOVT r0, #<higher 16 bits of function ID>
+  //   MOVW ip, #<lower 16 bits of address of TracingHook>
+  //   MOVT ip, #<higher 16 bits of address of TracingHook>
+  //   BLX ip
+  //   POP {r0, lr}
+  //
+  // The first 4-byte instruction is replaced last and atomically, so that
+  // user code reaching the sled concurrently either jumps over the whole sled
+  // or executes the complete patch. When |Enable| == false, only the first
+  // instruction is set back to B #20.
+  //
+  // The instruction cache is not kept coherent with these data writes on ARM,
+  // so the modified range is flushed before returning. Always returns true.
+
+  uint32_t *FirstAddress = reinterpret_cast<uint32_t *>(Sled.Address);
+  uint32_t *CurAddress = FirstAddress + 1;
+  if (Enable) {
+    CurAddress = Write32bitLoadR0(CurAddress, FuncId);
+    CurAddress =
+        Write32bitLoadIP(CurAddress, reinterpret_cast<uint32_t>(TracingHook));
+    *CurAddress++ = uint32_t(PatchOpcodes::PO_BlxIp);
+    *CurAddress++ = uint32_t(PatchOpcodes::PO_PopR0Lr);
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
+        uint32_t(PatchOpcodes::PO_PushR0Lr), std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
+        uint32_t(PatchOpcodes::PO_B20), std::memory_order_release);
+  }
+  // Enabled: the whole sled was rewritten; disabled: only its first word.
+  __builtin___clear_cache(reinterpret_cast<char *>(FirstAddress),
+                          reinterpret_cast<char *>(CurAddress));
+  return true;
+}
+
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
+                        const XRaySledEntry &Sled) {
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionEntry);
+}
+
+bool patchFunctionExit(const bool Enable, const uint32_t FuncId,
+                       const XRaySledEntry &Sled) {
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
+}
+
+} // namespace __xray
Index: compiler-rt/trunk/lib/xray/xray_inmemory_log.cc
===================================================================
--- compiler-rt/trunk/lib/xray/xray_inmemory_log.cc
+++ compiler-rt/trunk/lib/xray/xray_inmemory_log.cc
@@ -24,7 +24,14 @@
 #include <sys/types.h>
 #include <thread>
 #include <unistd.h>
-#include <x86intrin.h>
+
+#if defined(__x86_64__)
+  #include <x86intrin.h>
+#elif defined(__arm__)
+  static const int64_t NanosecondsPerSecond = 1000LL*1000*1000;
+#else
+  #error "Unsupported CPU Architecture"
+#endif /* CPU architecture */
 
 #include "sanitizer_common/sanitizer_libc.h"
 #include "xray/xray_records.h"
@@ -61,6 +68,7 @@
   }
 }
 
+#if defined(__x86_64__)
 static std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin,
                                                  char *End) {
   auto BytesToRead = std::distance(Begin, End);
@@ -103,6 +111,8 @@
   return Result;
 }
 
+#endif /* CPU architecture */
+
 class ThreadExitFlusher {
   int Fd;
   XRayRecord *Start;
@@ -164,6 +174,7 @@
 
     // Get the cycle frequency from SysFS on Linux.
     long long CPUFrequency = -1;
+#if defined(__x86_64__)
     if (readValueFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz",
                           &CPUFrequency)) {
       CPUFrequency *= 1000;
@@ -174,6 +185,20 @@
     } else {
       Report("Unable to determine CPU frequency for TSC accounting.");
     }
+#elif defined(__arm__)
+    // There is no instruction like RDTSCP in user mode on ARM. ARM's CP15 does
+    //   not have a constant frequency like TSC on x86(_64), it may go faster
+    //   or slower depending on CPU turbo or power saving mode. Furthermore,
+    //   to read from CP15 on ARM a kernel modification or a driver is needed.
+    //   We can not require this from users of compiler-rt.
+    // So on ARM we use clock_gettime() which gives the result in nanoseconds.
+    //   To get the measurements per second, we scale this by the number of
+    //   nanoseconds per second, pretending that the TSC frequency is 1GHz and
+    //   one TSC tick is 1 nanosecond.
+    CPUFrequency = NanosecondsPerSecond;
+#else
+  #error "Unsupported CPU Architecture"
+#endif /* CPU architecture */
 
     // Since we're here, we get to write the header. We set it up so that the
     // header will only be written once, at the start, and let the threads
@@ -201,10 +226,29 @@
   // First we get the useful data, and stuff it into the already aligned buffer
   // through a pointer offset.
   auto &R = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer)[Offset];
-  unsigned CPU;
   R.RecordType = RecordTypes::NORMAL;
-  R.TSC = __rdtscp(&CPU);
-  R.CPU = CPU;
+#if defined(__x86_64__)
+  {
+    unsigned CPU;
+    R.TSC = __rdtscp(&CPU);
+    R.CPU = CPU;
+  }
+#elif defined(__arm__)
+  {
+    timespec TS;
+    int result = clock_gettime(CLOCK_REALTIME, &TS);
+    if(result != 0)
+    {
+      Report("clock_gettime() returned %d, errno=%d.", result, int(errno));
+      TS.tv_sec = 0;
+      TS.tv_nsec = 0;
+    }
+    R.TSC = TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec;
+    R.CPU = 0;
+  }
+#else
+  #error "Unsupported CPU Architecture"
+#endif /* CPU architecture */
   R.TId = TId;
   R.Type = Type;
   R.FuncId = FuncId;
Index: compiler-rt/trunk/lib/xray/xray_interface.cc
===================================================================
--- compiler-rt/trunk/lib/xray/xray_interface.cc
+++ compiler-rt/trunk/lib/xray/xray_interface.cc
@@ -26,6 +26,15 @@
 
 namespace __xray {
 
+#if defined(__x86_64__)
+  // FIXME: The actual length is 11 bytes. Why was length 12 passed to mprotect() ?
+  static const int16_t cSledLength = 12;
+#elif defined(__arm__)
+  static const int16_t cSledLength = 28;
+#else
+  #error "Unsupported CPU Architecture"
+#endif /* CPU architecture */
+
 // This is the function to call when we encounter the entry or exit sleds.
 std::atomic<void (*)(int32_t, XRayEntryType)> XRayPatchedFunction{nullptr};
 
@@ -64,13 +73,6 @@
 
 } // namespace __xray
 
-extern "C" {
-// The following functions have to be defined in assembler, on a per-platform
-// basis. See xray_trampoline_*.s files for implementations.
-extern void __xray_FunctionEntry();
-extern void __xray_FunctionExit();
-}
-
 extern std::atomic<bool> XRayInitialized;
 extern std::atomic<__xray::XRaySledMap> XRayInstrMap;
 
@@ -133,12 +135,13 @@
   if (InstrMap.Entries == 0)
     return XRayPatchingStatus::NOT_INITIALIZED;
 
-  int32_t FuncId = 1;
-  static constexpr uint8_t CallOpCode = 0xe8;
-  static constexpr uint16_t MovR10Seq = 0xba41;
-  static constexpr uint16_t Jmp9Seq = 0x09eb;
-  static constexpr uint8_t JmpOpCode = 0xe9;
-  static constexpr uint8_t RetOpCode = 0xc3;
+  const uint64_t PageSize = GetPageSizeCached();
+  if((PageSize == 0) || ( (PageSize & (PageSize-1)) != 0) ) {
+    Report("System page size is not a power of two: %lld", PageSize);
+    return XRayPatchingStatus::FAILED;
+  }
+
+  uint32_t FuncId = 1;
   uint64_t CurFun = 0;
   for (std::size_t I = 0; I < InstrMap.Entries; I++) {
     auto Sled = InstrMap.Sleds[I];
@@ -153,112 +156,28 @@
     // While we're here, we should patch the nop sled. To do that we mprotect
     // the page containing the function to be writeable.
     void *PageAlignedAddr =
-        reinterpret_cast<void *>(Sled.Address & ~((2 << 16) - 1));
+        reinterpret_cast<void *>(Sled.Address & ~(PageSize-1));
     std::size_t MProtectLen =
-        (Sled.Address + 12) - reinterpret_cast<uint64_t>(PageAlignedAddr);
+        (Sled.Address + cSledLength) - reinterpret_cast<uint64_t>(PageAlignedAddr);
     MProtectHelper Protector(PageAlignedAddr, MProtectLen);
     if (Protector.MakeWriteable() == -1) {
       printf("Failed mprotect: %d\n", errno);
       return XRayPatchingStatus::FAILED;
     }
 
-    static constexpr int64_t MinOffset{std::numeric_limits<int32_t>::min()};
-    static constexpr int64_t MaxOffset{std::numeric_limits<int32_t>::max()};
-    if (Sled.Kind == XRayEntryType::ENTRY) {
-      // FIXME: Implement this in a more extensible manner, per-platform.
-      // Here we do the dance of replacing the following sled:
-      //
-      // xray_sled_n:
-      //   jmp +9
-      //   <9 byte nop>
-      //
-      // With the following:
-      //
-      //   mov r10d, <function id>
-      //   call <relative 32bit offset to entry trampoline>
-      //
-      // We need to do this in the following order:
-      //
-      // 1. Put the function id first, 2 bytes from the start of the sled (just
-      // after the 2-byte jmp instruction).
-      // 2. Put the call opcode 6 bytes from the start of the sled.
-      // 3. Put the relative offset 7 bytes from the start of the sled.
-      // 4. Do an atomic write over the jmp instruction for the "mov r10d"
-      // opcode and first operand.
-      //
-      // Prerequisite is to compute the relative offset to the
-      // __xray_FunctionEntry function's address.
-      int64_t TrampolineOffset =
-          reinterpret_cast<int64_t>(__xray_FunctionEntry) -
-          (static_cast<int64_t>(Sled.Address) + 11);
-      if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
-        Report("XRay Entry trampoline (%p) too far from sled (%p); distance = "
-               "%ld\n",
-               __xray_FunctionEntry, reinterpret_cast<void *>(Sled.Address),
-               TrampolineOffset);
-        continue;
-      }
-      if (Enable) {
-        *reinterpret_cast<uint32_t *>(Sled.Address + 2) = FuncId;
-        *reinterpret_cast<uint8_t *>(Sled.Address + 6) = CallOpCode;
-        *reinterpret_cast<uint32_t *>(Sled.Address + 7) = TrampolineOffset;
-        std::atomic_store_explicit(
-            reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), MovR10Seq,
-            std::memory_order_release);
-      } else {
-        std::atomic_store_explicit(
-            reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp9Seq,
-            std::memory_order_release);
-        // FIXME: Write out the nops still?
-      }
-    }
-
-    if (Sled.Kind == XRayEntryType::EXIT) {
-      // FIXME: Implement this in a more extensible manner, per-platform.
-      // Here we do the dance of replacing the following sled:
-      //
-      // xray_sled_n:
-      //   ret
-      //   <10 byte nop>
-      //
-      // With the following:
-      //
-      //   mov r10d, <function id>
-      //   jmp <relative 32bit offset to exit trampoline>
-      //
-      // 1. Put the function id first, 2 bytes from the start of the sled (just
-      // after the 1-byte ret instruction).
-      // 2. Put the jmp opcode 6 bytes from the start of the sled.
-      // 3. Put the relative offset 7 bytes from the start of the sled.
-      // 4. Do an atomic write over the jmp instruction for the "mov r10d"
-      // opcode and first operand.
-      //
-      // Prerequisite is to compute the relative offset fo the
-      // __xray_FunctionExit function's address.
-      int64_t TrampolineOffset =
-          reinterpret_cast<int64_t>(__xray_FunctionExit) -
-          (static_cast<int64_t>(Sled.Address) + 11);
-      if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
-        Report("XRay Exit trampoline (%p) too far from sled (%p); distance = "
-               "%ld\n",
-               __xray_FunctionExit, reinterpret_cast<void *>(Sled.Address),
-               TrampolineOffset);
-        continue;
-      }
-      if (Enable) {
-        *reinterpret_cast<uint32_t *>(Sled.Address + 2) = FuncId;
-        *reinterpret_cast<uint8_t *>(Sled.Address + 6) = JmpOpCode;
-        *reinterpret_cast<uint32_t *>(Sled.Address + 7) = TrampolineOffset;
-        std::atomic_store_explicit(
-            reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), MovR10Seq,
-            std::memory_order_release);
-      } else {
-        std::atomic_store_explicit(
-            reinterpret_cast<std::atomic<uint8_t> *>(Sled.Address), RetOpCode,
-            std::memory_order_release);
-        // FIXME: Write out the nops still?
-      }
+    bool Success = false;
+    switch(Sled.Kind) {
+    case XRayEntryType::ENTRY:
+      Success = patchFunctionEntry(Enable, FuncId, Sled);
+      break;
+    case XRayEntryType::EXIT:
+      Success = patchFunctionExit(Enable, FuncId, Sled);
+      break;
+    default:
+      Report("Unsupported sled kind: %d", int(Sled.Kind));
+      continue;
     }
+    (void)Success;
   }
   XRayPatching.store(false, std::memory_order_release);
   PatchingSuccess = true;
Index: compiler-rt/trunk/lib/xray/xray_interface_internal.h
===================================================================
--- compiler-rt/trunk/lib/xray/xray_interface_internal.h
+++ compiler-rt/trunk/lib/xray/xray_interface_internal.h
@@ -16,18 +16,30 @@
 #define XRAY_INTERFACE_INTERNAL_H
 
 #include "xray/xray_interface.h"
+#include "sanitizer_common/sanitizer_platform.h"
 #include <cstddef>
 #include <cstdint>
 
 extern "C" {
 
 struct XRaySledEntry {
+#if SANITIZER_WORDSIZE == 64
   uint64_t Address;
   uint64_t Function;
   unsigned char Kind;
   unsigned char AlwaysInstrument;
   unsigned char Padding[14]; // Need 32 bytes
+#elif SANITIZER_WORDSIZE == 32
+  uint32_t Address;
+  uint32_t Function;
+  unsigned char Kind;
+  unsigned char AlwaysInstrument;
+  unsigned char Padding[6]; // Need 16 bytes
+#else
+	#error "Unsupported word size."
+#endif
 };
+
 }
 
 namespace __xray {
@@ -37,6 +49,16 @@
   size_t Entries;
 };
 
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, const XRaySledEntry& Sled);
+bool patchFunctionExit(const bool Enable, const uint32_t FuncId, const XRaySledEntry& Sled);
+
 } // namespace __xray
 
+extern "C" {
+// The following functions have to be defined in assembler, on a per-platform
+// basis. See xray_trampoline_*.S files for implementations.
+extern void __xray_FunctionEntry();
+extern void __xray_FunctionExit();
+}
+
 #endif
Index: compiler-rt/trunk/lib/xray/xray_trampoline_arm.S
===================================================================
--- compiler-rt/trunk/lib/xray/xray_trampoline_arm.S
+++ compiler-rt/trunk/lib/xray/xray_trampoline_arm.S
@@ -0,0 +1,65 @@
+    .syntax unified
+    .arch armv7
+    .fpu vfpv3
+    .code 32
+    .global _ZN6__xray19XRayPatchedFunctionE
+    @ Word-aligned function entry point
+    .p2align 2
+    @ Let C/C++ see the symbol
+    .global __xray_FunctionEntry
+    @ It preserves all registers except r0, r12(ip), r14(lr) and r15(pc)
+    @ Assume that "q" part of the floating-point registers is not used
+    @   for passing parameters to C/C++ functions.
+    .type __xray_FunctionEntry, %function
+    @ In C++ it is extern "C" void __xray_FunctionEntry(uint32_t FuncId) with
+    @   FuncId passed in r0 register.
+__xray_FunctionEntry:
+    PUSH {r1-r3,lr}
+    @ Save floating-point parameters of the instrumented function
+    VPUSH {d0-d7}
+    MOVW r1,#:lower16:_ZN6__xray19XRayPatchedFunctionE
+    MOVT r1,#:upper16:_ZN6__xray19XRayPatchedFunctionE
+    LDR r2, [r1]
+    @ Handler address is nullptr if handler is not set
+    CMP r2, #0
+    BEQ FunctionEntry_restore
+    @ Function ID is already in r0 (the first parameter).
+    @ r1=0 means that we are tracing an entry event
+    MOV r1, #0
+    @ Call the handler with 2 parameters in r0 and r1
+    BLX r2
+FunctionEntry_restore:
+    @ Restore floating-point parameters of the instrumented function
+    VPOP {d0-d7}
+    POP {r1-r3,pc}
+
+    @ Word-aligned function entry point
+    .p2align 2
+    @ Let C/C++ see the symbol
+    .global __xray_FunctionExit
+    @ Assume that d1-d7 and the "q" part of the floating-point registers are
+    @   not used for the return value in C/C++.
+    .type __xray_FunctionExit, %function
+    @ In C++ it is extern "C" void __xray_FunctionExit(uint32_t FuncId) with
+    @   FuncId passed in r0 register.
+__xray_FunctionExit:
+    PUSH {r1-r3,lr}
+    @ Save the floating-point return value of the instrumented function
+    VPUSH {d0}
+    @ Load the handler address
+    MOVW r1,#:lower16:_ZN6__xray19XRayPatchedFunctionE
+    MOVT r1,#:upper16:_ZN6__xray19XRayPatchedFunctionE
+    LDR r2, [r1]
+    @ Handler address is nullptr if handler is not set
+    CMP r2, #0
+    BEQ FunctionExit_restore
+    @ Function ID is already in r0; r1=1 means that we are tracing an exit event
+    MOV r1, #1
+    @ Call the handler with 2 parameters in r0 and r1
+    BLX r2
+FunctionExit_restore:
+    @ Restore the floating-point return value of the instrumented function
+    VPOP {d0}
+    POP {r1-r3,pc}
+    @ This object needs no executable stack; say so explicitly for the linker
+    .section .note.GNU-stack,"",%progbits
Index: compiler-rt/trunk/lib/xray/xray_x86_64.cc
===================================================================
--- compiler-rt/trunk/lib/xray/xray_x86_64.cc
+++ compiler-rt/trunk/lib/xray/xray_x86_64.cc
@@ -0,0 +1,116 @@
+#include "xray_interface_internal.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include <atomic>
+#include <cstdint>
+#include <limits>
+
+namespace __xray {
+
+static constexpr uint8_t CallOpCode = 0xe8;
+static constexpr uint16_t MovR10Seq = 0xba41;
+static constexpr uint16_t Jmp9Seq = 0x09eb;
+static constexpr uint8_t JmpOpCode = 0xe9;
+static constexpr uint8_t RetOpCode = 0xc3;
+
+static constexpr int64_t MinOffset{std::numeric_limits<int32_t>::min()};
+static constexpr int64_t MaxOffset{std::numeric_limits<int32_t>::max()};
+
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, const XRaySledEntry& Sled)
+{
+  // When |Enable| is true, we do the dance of replacing the following sled:
+  //
+  // xray_sled_n:
+  //   jmp +9
+  //   <9 byte nop>
+  //
+  // With the following:
+  //
+  //   mov r10d, <function id>
+  //   call <relative 32bit offset to entry trampoline>
+  //
+  // We need to do this in the following order:
+  //
+  // 1. Put the function id first, 2 bytes from the start of the sled (just
+  // after the 2-byte jmp instruction).
+  // 2. Put the call opcode 6 bytes from the start of the sled.
+  // 3. Put the relative offset 7 bytes from the start of the sled.
+  // 4. Do an atomic write over the jmp instruction for the "mov r10d"
+  // opcode and first operand.
+  // When |Enable| is false, only the 2-byte jmp +9 is written back.
+  // Prerequisite: the offset to __xray_FunctionEntry must fit in 32 bits;
+  // otherwise the sled is left untouched and false is returned.
+  int64_t TrampolineOffset =
+      reinterpret_cast<int64_t>(__xray_FunctionEntry) -
+      (static_cast<int64_t>(Sled.Address) + 11);
+  if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
+    Report("XRay Entry trampoline (%p) too far from sled (%p); distance = "
+           "%ld\n",
+           __xray_FunctionEntry, reinterpret_cast<void *>(Sled.Address),
+           TrampolineOffset);
+    return false;
+  }
+  if (Enable) {
+    *reinterpret_cast<uint32_t *>(Sled.Address + 2) = FuncId;
+    *reinterpret_cast<uint8_t *>(Sled.Address + 6) = CallOpCode;
+    *reinterpret_cast<uint32_t *>(Sled.Address + 7) = TrampolineOffset;
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), MovR10Seq,
+        std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp9Seq,
+        std::memory_order_release);
+    // FIXME: Write out the nops still?
+  }
+  return true;
+}
+
+bool patchFunctionExit(const bool Enable, const uint32_t FuncId, const XRaySledEntry& Sled)
+{
+  // When |Enable| is true, we do the dance of replacing the following sled:
+  //
+  // xray_sled_n:
+  //   ret
+  //   <10 byte nop>
+  //
+  // With the following:
+  //
+  //   mov r10d, <function id>
+  //   jmp <relative 32bit offset to exit trampoline>
+  //
+  // 1. Put the function id first, 2 bytes from the start of the sled (right
+  // after the two bytes that will hold the "mov r10d" opcode).
+  // 2. Put the jmp opcode 6 bytes from the start of the sled.
+  // 3. Put the relative offset 7 bytes from the start of the sled.
+  // 4. Do an atomic 2-byte write over the ret instruction and the nop byte
+  // after it for the "mov r10d" opcode.
+  // When |Enable| is false, only the 1-byte ret is written back.
+  // Prerequisite: the offset to __xray_FunctionExit must fit in 32 bits;
+  // otherwise the sled is left untouched and false is returned.
+  int64_t TrampolineOffset =
+      reinterpret_cast<int64_t>(__xray_FunctionExit) -
+      (static_cast<int64_t>(Sled.Address) + 11);
+  if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
+    Report("XRay Exit trampoline (%p) too far from sled (%p); distance = "
+           "%ld\n",
+           __xray_FunctionExit, reinterpret_cast<void *>(Sled.Address),
+           TrampolineOffset);
+    return false;
+  }
+  if (Enable) {
+    *reinterpret_cast<uint32_t *>(Sled.Address + 2) = FuncId;
+    *reinterpret_cast<uint8_t *>(Sled.Address + 6) = JmpOpCode;
+    *reinterpret_cast<uint32_t *>(Sled.Address + 7) = TrampolineOffset;
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), MovR10Seq,
+        std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint8_t> *>(Sled.Address), RetOpCode,
+        std::memory_order_release);
+    // FIXME: Write out the nops still?
+  }
+  return true;
+}
+
+} // namespace __xray