Index: cmake/config-ix.cmake
===================================================================
--- cmake/config-ix.cmake
+++ cmake/config-ix.cmake
@@ -161,7 +161,7 @@
 set(ALL_CFI_SUPPORTED_ARCH ${X86} ${X86_64} ${MIPS64})
 set(ALL_ESAN_SUPPORTED_ARCH ${X86_64} ${MIPS64})
 set(ALL_SCUDO_SUPPORTED_ARCH ${X86_64})
-set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32})
+set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64})
 
 if(APPLE)
   include(CompilerRTDarwinUtils)
Index: lib/xray/CMakeLists.txt
===================================================================
--- lib/xray/CMakeLists.txt
+++ lib/xray/CMakeLists.txt
@@ -19,6 +19,11 @@
 
 set(armhf_SOURCES ${arm_SOURCES})
 
+set(aarch64_SOURCES
+  xray_AArch64.cc
+  xray_trampoline_AArch64.S
+  ${XRAY_SOURCES})
+
 include_directories(..)
 include_directories(../../include)
 
Index: lib/xray/xray_AArch64.cc
===================================================================
--- lib/xray/xray_AArch64.cc
+++ lib/xray/xray_AArch64.cc
@@ -0,0 +1,103 @@
+//===-- xray_AArch64.cc -----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Implementation of AArch64-specific routines (64-bit).
+//
+//===----------------------------------------------------------------------===//
+#include "xray_interface_internal.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include <atomic>
+#include <cassert>
+
+namespace __xray {
+
+// The machine codes for some instructions used in runtime patching.
+enum class PatchOpcodes : uint32_t {
+  PO_StpX0X30SP_m16e = 0xA9BF7BE0, // STP X0, X30, [SP, #-16]!
+  PO_LdrW0_12 = 0x18000060, // LDR W0, #12
+  PO_LdrX16_12 = 0x58000070, // LDR X16, #12
+  PO_BlrX16 = 0xD63F0200, // BLR X16
+  PO_LdpX0X30SP_16 = 0xA8C17BE0, // LDP X0, X30, [SP], #16
+  PO_B32 = 0x14000008 // B #32
+};
+
+inline static bool patchSled(const bool Enable, const uint32_t FuncId,
+                             const XRaySledEntry &Sled, void (*TracingHook)()) {
+  // When |Enable| == true,
+  // We replace the following compile-time stub (sled):
+  //
+  // xray_sled_n:
+  //   B #32
+  //   7 NOPs (28 bytes)
+  //
+  // With the following runtime patch:
+  //
+  // xray_sled_n:
+  //   STP X0, X30, [SP, #-16]! ; PUSH {r0, lr}
+  //   LDR W0, #12 ; W0 := function ID
+  //   LDR X16,#12 ; X16 := address of the trampoline
+  //   BLR X16
+  //   ;DATA: 32 bits of function ID
+  //   ;DATA: lower 32 bits of the address of the trampoline
+  //   ;DATA: higher 32 bits of the address of the trampoline
+  //   LDP X0, X30, [SP], #16 ; POP {r0, lr}
+  //
+  // Replacement of the first 4-byte instruction should be the last and atomic
+  // operation, so that the user code which reaches the sled concurrently
+  // either jumps over the whole sled, or executes the whole sled when the
+  // latter is ready.
+  //
+  // When |Enable|==false, we set back the first instruction in the sled to be
+  //   B #32
+
+  uint32_t *FirstAddress = reinterpret_cast<uint32_t *>(Sled.Address);
+  if (Enable) {
+    uint32_t *CurAddress = FirstAddress + 1;
+    *CurAddress = uint32_t(PatchOpcodes::PO_LdrW0_12);
+    CurAddress++;
+    *CurAddress = uint32_t(PatchOpcodes::PO_LdrX16_12);
+    CurAddress++;
+    *CurAddress = uint32_t(PatchOpcodes::PO_BlrX16);
+    CurAddress++;
+    *CurAddress = FuncId;
+    CurAddress++;
+    *reinterpret_cast<void (**)()>(CurAddress) = TracingHook;
+    CurAddress+=2;
+    *CurAddress = uint32_t(PatchOpcodes::PO_LdpX0X30SP_16);
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
+        uint32_t(PatchOpcodes::PO_StpX0X30SP_m16e), std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
+        uint32_t(PatchOpcodes::PO_B32), std::memory_order_release);
+  }
+  return true;
+}
+
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
+                        const XRaySledEntry &Sled) {
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionEntry);
+}
+
+bool patchFunctionExit(const bool Enable, const uint32_t FuncId,
+                       const XRaySledEntry &Sled) {
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
+}
+
+bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
+                           const XRaySledEntry &Sled) {
+  // FIXME: In the future we'd need to distinguish between non-tail exits and
+  // tail exits for better information preservation.
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
+}
+
+} // namespace __xray
Index: lib/xray/xray_inmemory_log.cc
===================================================================
--- lib/xray/xray_inmemory_log.cc
+++ lib/xray/xray_inmemory_log.cc
@@ -27,7 +27,7 @@
 
 #if defined(__x86_64__)
 #include <x86intrin.h>
-#elif defined(__arm__)
+#elif defined(__arm__) || defined(__aarch64__)
 static const int64_t NanosecondsPerSecond = 1000LL * 1000 * 1000;
 #else
 #error "Unsupported CPU Architecture"
@@ -185,7 +185,7 @@
     } else {
       Report("Unable to determine CPU frequency for TSC accounting.");
     }
-#elif defined(__arm__)
+#elif defined(__arm__) || defined(__aarch64__)
     // There is no instruction like RDTSCP in user mode on ARM. ARM's CP15 does
     //   not have a constant frequency like TSC on x86(_64), it may go faster
     //   or slower depending on CPU turbo or power saving mode. Furthermore,
@@ -233,7 +233,7 @@
     R.TSC = __rdtscp(&CPU);
     R.CPU = CPU;
   }
-#elif defined(__arm__)
+#elif defined(__arm__) || defined(__aarch64__)
   {
     timespec TS;
     int result = clock_gettime(CLOCK_REALTIME, &TS);
Index: lib/xray/xray_interface.cc
===================================================================
--- lib/xray/xray_interface.cc
+++ lib/xray/xray_interface.cc
@@ -26,15 +26,19 @@
 
 namespace __xray {
 
+static const int16_t cSledLength =
 #if defined(__x86_64__)
 // FIXME: The actual length is 11 bytes. Why was length 12 passed to mprotect()
 // ?
-static const int16_t cSledLength = 12;
+    12
 #elif defined(__arm__)
-static const int16_t cSledLength = 28;
+    28
+#elif defined(__aarch64__)
+    32
 #else
 #error "Unsupported CPU Architecture"
 #endif /* CPU architecture */
+;
 
 // This is the function to call when we encounter the entry or exit sleds.
 std::atomic<void (*)(int32_t, XRayEntryType)> XRayPatchedFunction{nullptr};
Index: lib/xray/xray_trampoline_AArch64.S
===================================================================
--- lib/xray/xray_trampoline_AArch64.S
+++ lib/xray/xray_trampoline_AArch64.S
@@ -0,0 +1,89 @@
+    .text
+    /* The variable containing the handler function pointer */
+    .global _ZN6__xray19XRayPatchedFunctionE
+    /* Word-aligned function entry point */
+    .p2align 2
+    /* Let C/C++ see the symbol */
+    .global __xray_FunctionEntry
+    .type __xray_FunctionEntry, %function
+    /* In C++ it is void extern "C" __xray_FunctionEntry(uint32_t FuncId) with
+         FuncId passed in W0 register. */
+__xray_FunctionEntry:
+    /* Move the return address beyond the end of sled data. The 12 bytes of
+         data are inserted in the code of the runtime patch, between the call
+         instruction and the instruction returned into. The data contains 32
+         bits of instrumented function ID and 64 bits of the address of
+         the current trampoline. */
+    ADD X30, X30, #12
+    /* Push the registers which may be modified by the handler function */
+    STP X1, X2, [SP, #-16]!
+    STP X3, X4, [SP, #-16]!
+    STP X5, X6, [SP, #-16]!
+    STP X7, X30, [SP, #-16]!
+    STP Q0, Q1, [SP, #-32]!
+    STP Q2, Q3, [SP, #-32]!
+    STP Q4, Q5, [SP, #-32]!
+    STP Q6, Q7, [SP, #-32]!
+    /* Load the address of _ZN6__xray19XRayPatchedFunctionE into X1 */
+    LDR X1, =_ZN6__xray19XRayPatchedFunctionE
+    /* Load the handler function pointer into X2 */
+    LDR X2, [X1]
+    /* Handler address is nullptr if handler is not set */
+    CMP X2, #0
+    BEQ FunctionEntry_restore
+    /* Function ID is already in W0 (the first parameter).
+         X1=0 means that we are tracing an entry event */
+    MOV X1, #0
+    /* Call the handler with 2 parameters in W0 and X1 */
+    BLR X2
+FunctionEntry_restore:
+    /* Pop the saved registers */
+    LDP Q6, Q7, [SP], #32
+    LDP Q4, Q5, [SP], #32
+    LDP Q2, Q3, [SP], #32
+    LDP Q0, Q1, [SP], #32
+    LDP X7, X30, [SP], #16
+    LDP X5, X6, [SP], #16
+    LDP X3, X4, [SP], #16
+    LDP X1, X2, [SP], #16
+    RET
+
+    /* Word-aligned function entry point */
+    .p2align 2
+    /* Let C/C++ see the symbol */
+    .global __xray_FunctionExit
+    .type __xray_FunctionExit, %function
+    /* In C++ it is void extern "C" __xray_FunctionExit(uint32_t FuncId) with
+         FuncId passed in W0 register. */
+__xray_FunctionExit:
+    /* Move the return address beyond the end of sled data. The 12 bytes of
+         data are inserted in the code of the runtime patch, between the call
+         instruction and the instruction returned into. The data contains 32
+         bits of instrumented function ID and 64 bits of the address of
+         the current trampoline. */
+    ADD X30, X30, #12
+    /* Push the registers which may be modified by the handler function */
+    STP X1, X2, [SP, #-16]!
+    STP X3, X4, [SP, #-16]!
+    STP X5, X6, [SP, #-16]!
+    STP X7, X30, [SP, #-16]!
+    STR Q0, [SP, #-16]!
+    /* Load the address of _ZN6__xray19XRayPatchedFunctionE into X1 */
+    LDR X1, =_ZN6__xray19XRayPatchedFunctionE
+    /* Load the handler function pointer into X2 */
+    LDR X2, [X1]
+    /* Handler address is nullptr if handler is not set */
+    CMP X2, #0
+    BEQ FunctionExit_restore
+    /* Function ID is already in W0 (the first parameter).
+         X1=1 means that we are tracing an exit event */
+    MOV X1, #1
+    /* Call the handler with 2 parameters in W0 and X1 */
+    BLR X2
+FunctionExit_restore:
+    LDR Q0, [SP], #16
+    LDP X7, X30, [SP], #16
+    LDP X5, X6, [SP], #16
+    LDP X3, X4, [SP], #16
+    LDP X1, X2, [SP], #16
+    RET