Index: compiler-rt/trunk/lib/xray/xray_trampoline_x86_64.S =================================================================== --- compiler-rt/trunk/lib/xray/xray_trampoline_x86_64.S +++ compiler-rt/trunk/lib/xray/xray_trampoline_x86_64.S @@ -60,6 +60,20 @@ .cfi_def_cfa_offset 8 .endm +.macro ALIGNED_CALL_RAX + // Call the logging handler, after aligning the stack to a 16-byte boundary. + // The approach we're taking here uses additional stack space to stash the + // stack pointer twice before aligning the pointer to 16-bytes. If the stack + // was 8-byte aligned, it will become 16-byte aligned -- when restoring the + // pointer, we can always look -8 bytes from the current position to get + // either of the values we've stashed in the first place. + pushq %rsp + pushq (%rsp) + andq $-0x10, %rsp + callq *%rax + movq 8(%rsp), %rsp +.endm + .text .file "xray_trampoline_x86.S" @@ -82,7 +96,8 @@ // The patched function prolog puts its xray_instr_map index into %r10d. movl %r10d, %edi xor %esi,%esi - callq *%rax + ALIGNED_CALL_RAX + .Ltmp0: RESTORE_REGISTERS retq @@ -113,7 +128,8 @@ movl %r10d, %edi movl $1, %esi - callq *%rax + ALIGNED_CALL_RAX + .Ltmp2: // Restore the important registers. movq 48(%rsp), %rbp @@ -143,7 +159,8 @@ movl %r10d, %edi movl $2, %esi - callq *%rax + + ALIGNED_CALL_RAX .Ltmp4: RESTORE_REGISTERS @@ -181,7 +198,7 @@ // 32-bit function ID becomes the first movl %r10d, %edi - callq *%rax + ALIGNED_CALL_RAX .Larg1entryFail: RESTORE_REGISTERS @@ -207,18 +224,7 @@ testq %rax,%rax je .LcustomEventCleanup - // At this point we know that rcx and rdx already has the data, so we just - // call the logging handler, after aligning the stack to a 16-byte boundary. - // The approach we're taking here uses additional stack space to stash the - // stack pointer twice before aligning the pointer to 16-bytes. If the stack - // was 8-byte aligned, it will become 16-byte aligned -- when restoring the - // pointer, we can always look -8 bytes from the current position to get - // either of the values we've stashed in the first place. - pushq %rsp - pushq (%rsp) - andq $-0x10, %rsp - callq *%rax - movq 8(%rsp), %rsp + ALIGNED_CALL_RAX .LcustomEventCleanup: RESTORE_REGISTERS Index: compiler-rt/trunk/test/xray/TestCases/Linux/common-trampoline-alignment.cc =================================================================== --- compiler-rt/trunk/test/xray/TestCases/Linux/common-trampoline-alignment.cc +++ compiler-rt/trunk/test/xray/TestCases/Linux/common-trampoline-alignment.cc @@ -0,0 +1,57 @@ +// Make sure that we're aligning the stack properly to support handlers that +// expect 16-byte alignment of the stack. +// +// RUN: %clangxx_xray -std=c++11 %s -o %t +// RUN: XRAY_OPTIONS="patch_premain=false verbosity=1 xray_naive_log=false" \ +// RUN: %run %t 2>&1 +// REQUIRES: x86_64-linux +// REQUIRES: built-in-llvm-tree +#include "xray/xray_interface.h" +#include +#include + +[[clang::xray_never_instrument]] __attribute__((weak)) __m128 f(__m128 *i) { + return *i; +} + +[[clang::xray_always_instrument]] __attribute__((noinline)) void noarg() { + __m128 v = {}; + f(&v); +} + +[[ clang::xray_always_instrument, clang::xray_log_args(1) ]] +__attribute__((noinline)) void arg1(int) { + __m128 v = {}; + f(&v); +} + +[[clang::xray_always_instrument]] __attribute__((noinline)) +void no_alignment() {} + +[[clang::xray_never_instrument]] void noarg_handler(int32_t, + XRayEntryType) { + printf("noarg handler called\n"); + __m128 v = {}; + f(&v); +} + +[[clang::xray_never_instrument]] void arg1_handler(int32_t, XRayEntryType, + uint64_t) { + printf("arg1 handler called\n"); + __m128 v = {}; + f(&v); +} + +int main(int argc, char *argv[]) { + __xray_set_handler(noarg_handler); + __xray_set_handler_arg1(arg1_handler); + __xray_patch(); + noarg(); // CHECK: noarg handler called + arg1(argc); // CHECK: arg1 handler called + no_alignment(); + __xray_unpatch(); + __xray_remove_handler(); + __xray_remove_handler_arg1(); + noarg(); + arg1(argc); +}