Index: bolt/CMakeLists.txt
===================================================================
--- bolt/CMakeLists.txt
+++ bolt/CMakeLists.txt
@@ -32,10 +32,14 @@
 endforeach()
 
 set(BOLT_ENABLE_RUNTIME_default OFF)
-if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64"
+# Enable the runtime by default only on a supported host OS, and only when
+# the backend matching the host processor is part of BOLT_TARGETS_TO_BUILD.
+if (((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64"
+      AND "X86" IN_LIST BOLT_TARGETS_TO_BUILD)
+    OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64"
+      AND "AArch64" IN_LIST BOLT_TARGETS_TO_BUILD))
     AND (CMAKE_SYSTEM_NAME STREQUAL "Linux"
-      OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")
-    AND "X86" IN_LIST BOLT_TARGETS_TO_BUILD)
+      OR CMAKE_SYSTEM_NAME STREQUAL "Darwin"))
   set(BOLT_ENABLE_RUNTIME_default ON)
 endif()
 option(BOLT_ENABLE_RUNTIME "Enable BOLT runtime" ${BOLT_ENABLE_RUNTIME_default})
Index: bolt/runtime/CMakeLists.txt
===================================================================
--- bolt/runtime/CMakeLists.txt
+++ bolt/runtime/CMakeLists.txt
@@ -27,8 +27,11 @@
   -fno-exceptions
   -fno-rtti
   -fno-stack-protector
-  -mno-sse
   -fPIC)
+# -mno-sse is an x86-specific option, so only pass it on x86_64 hosts.
+if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+  list(APPEND BOLT_RT_FLAGS -mno-sse)
+endif()
 
 # Don't let the compiler think it can create calls to standard libs
 target_compile_options(bolt_rt_instr PRIVATE ${BOLT_RT_FLAGS})
Index: bolt/runtime/common.h
===================================================================
--- bolt/runtime/common.h
+++ bolt/runtime/common.h
@@ -6,10 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#if !defined(__x86_64__)
-#error "For x86_64 only"
-#endif
-
 #if defined(__linux__)
 
 #include <cstddef>
@@ -44,44 +40,6 @@
 #error "For Linux or MacOS only"
 #endif
 
-// Save all registers while keeping 16B stack alignment
-#define SAVE_ALL \
-  "push %%rax\n" \
-  "push %%rbx\n" \
-  "push %%rcx\n" \
-  "push %%rdx\n" \
-  "push %%rdi\n" \
-  "push %%rsi\n" \
-  "push %%rbp\n" \
-  "push %%r8\n" \
-  "push %%r9\n" \
-  "push %%r10\n" \
-  "push %%r11\n" \
-  "push %%r12\n" \
-  "push %%r13\n" \
-  "push %%r14\n" \
-  "push %%r15\n" \
-  "sub $8, %%rsp\n"
-
-// Mirrors SAVE_ALL
-#define RESTORE_ALL \
-  "add $8, %%rsp\n" \
-  "pop %%r15\n"
\
-  "pop %%r14\n" \
-  "pop %%r13\n" \
-  "pop %%r12\n" \
-  "pop %%r11\n" \
-  "pop %%r10\n" \
-  "pop %%r9\n" \
-  "pop %%r8\n" \
-  "pop %%rbp\n" \
-  "pop %%rsi\n" \
-  "pop %%rdi\n" \
-  "pop %%rdx\n" \
-  "pop %%rcx\n" \
-  "pop %%rbx\n" \
-  "pop %%rax\n"
-
 #define PROT_READ 0x1  /* Page can be read.  */
 #define PROT_WRITE 0x2 /* Page can be written.  */
 #define PROT_EXEC 0x4  /* Page can be executed.  */
@@ -154,127 +112,44 @@
 
 // Anonymous namespace covering everything but our library entry point
 namespace {
-constexpr uint32_t BufSize = 10240;
-
-#define _STRINGIFY(x) #x
-#define STRINGIFY(x) _STRINGIFY(x)
-
-uint64_t __read(uint64_t fd, const void *buf, uint64_t count) {
-  uint64_t ret;
-#if defined(__APPLE__)
-#define READ_SYSCALL 0x2000003
-#else
-#define READ_SYSCALL 0
-#endif
-  __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       : "D"(fd), "S"(buf), "d"(count)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
-  uint64_t ret;
-#if defined(__APPLE__)
-#define WRITE_SYSCALL 0x2000004
-#else
-#define WRITE_SYSCALL 1
-#endif
-  __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       : "D"(fd), "S"(buf), "d"(count)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
-             uint64_t fd, uint64_t offset) {
-#if defined(__APPLE__)
-#define MMAP_SYSCALL 0x20000c5
-#else
-#define MMAP_SYSCALL 9
-#endif
-  void *ret;
-  register uint64_t r8 asm("r8") = fd;
-  register uint64_t r9 asm("r9") = offset;
-  register uint64_t r10 asm("r10") = flags;
-  __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8),
-                         "r"(r9)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-uint64_t __munmap(void *addr, uint64_t size) {
-#if defined(__APPLE__)
-#define MUNMAP_SYSCALL 0x2000049
-#else
-#define MUNMAP_SYSCALL 11
-#endif
-  uint64_t ret;
-  __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       : "D"(addr), "S"(size)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
+struct dirent64 {
+  uint64_t d_ino;          /* Inode number */
+  int64_t d_off;           /* Offset to next dirent64 record */
+  unsigned short d_reclen; /* Length of this dirent64 record */
+  unsigned char d_type;    /* File type */
+  char d_name[];           /* Filename (null-terminated) */
+  /* NOTE(review): the name occupies at most (d_reclen -
+    offsetof(struct dirent64, d_name)) bytes -- TODO confirm */
+};
 
-#define SIG_BLOCK 0
-#define SIG_UNBLOCK 1
-#define SIG_SETMASK 2
+/* Length of the entries in `struct utsname' is 65.  */
+#define _UTSNAME_LENGTH 65
 
-static const uint64_t MaskAllSignals[] = {-1ULL};
+struct UtsNameTy {
+  char sysname[_UTSNAME_LENGTH];  /* Operating system name (e.g., "Linux") */
+  char nodename[_UTSNAME_LENGTH]; /* Name within "some implementation-defined
+                      network" */
+  char release[_UTSNAME_LENGTH]; /* Operating system release (e.g., "2.6.28") */
+  char version[_UTSNAME_LENGTH]; /* Operating system version */
+  char machine[_UTSNAME_LENGTH]; /* Hardware identifier */
+  char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */
+};
 
-uint64_t __sigprocmask(int how, const void *set, void *oldset) {
-#if defined(__APPLE__)
-#define SIGPROCMASK_SYSCALL 0x2000030
-#else
-#define SIGPROCMASK_SYSCALL 14
-#endif
-  uint64_t ret;
-  register long r10 asm("r10") = sizeof(uint64_t);
-  __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       : "D"(how), "S"(set), "d"(oldset), "r"(r10)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
+struct timespec {
+  uint64_t tv_sec;  /* seconds */
+  uint64_t tv_nsec; /* nanoseconds */
+};
 
-uint64_t __getpid() {
-  uint64_t ret;
-#if defined(__APPLE__)
-#define GETPID_SYSCALL 20
+// Pull in the syscall wrappers and SAVE_ALL/RESTORE_ALL for the target CPU.
+#if defined(__aarch64__)
+#include "sys_aarch64.h"
+#elif defined(__x86_64__)
+#include "sys_x86_64.h"
 #else
-#define GETPID_SYSCALL 39
+#error "For AArch64 and x86_64 only"
 #endif
-  __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       :
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
 
-uint64_t __exit(uint64_t code) {
-#if defined(__APPLE__)
-#define EXIT_SYSCALL 0x2000001
-#else
-#define EXIT_SYSCALL 231
-#endif
-  uint64_t ret;
-  __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       : "D"(code)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
+constexpr uint32_t BufSize = 10240;
 
 // Helper functions for writing strings to the .fdata file. We intentionally
 // avoid using libc names to make it clear it is our impl.
@@ -390,209 +265,6 @@
   return false;
 }
 
-#if !defined(__APPLE__)
-// We use a stack-allocated buffer for string manipulation in many pieces of
-// this code, including the code that prints each line of the fdata file. This
-// buffer needs to accomodate large function names, but shouldn't be arbitrarily
-// large (dynamically allocated) for simplicity of our memory space usage.
-
-// Declare some syscall wrappers we use throughout this code to avoid linking
-// against system libc.
-uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
-  uint64_t ret;
-  __asm__ __volatile__("movq $2, %%rax\n"
-                       "syscall"
-                       : "=a"(ret)
-                       : "D"(pathname), "S"(flags), "d"(mode)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-struct dirent {
-  unsigned long d_ino;     /* Inode number */
-  unsigned long d_off;     /* Offset to next linux_dirent */
-  unsigned short d_reclen; /* Length of this linux_dirent */
-  char d_name[];           /* Filename (null-terminated) */
-  /* length is actually (d_reclen - 2 -
-    offsetof(struct linux_dirent, d_name)) */
-};
-
-long __getdents(unsigned int fd, dirent *dirp, size_t count) {
-  long ret;
-  __asm__ __volatile__("movq $78, %%rax\n"
-                       "syscall"
-                       : "=a"(ret)
-                       : "D"(fd), "S"(dirp), "d"(count)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) {
-  uint64_t ret;
-  __asm__ __volatile__("movq $89, %%rax\n"
-                       "syscall"
-                       : "=a"(ret)
-                       : "D"(pathname), "S"(buf), "d"(bufsize)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
-  uint64_t ret;
-  __asm__ __volatile__("movq $8, %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       : "D"(fd), "S"(pos), "d"(whence)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-int __close(uint64_t fd) {
-  uint64_t ret;
-  __asm__ __volatile__("movq $3, %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       : "D"(fd)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-int __madvise(void *addr, size_t length, int advice) {
-  int ret;
-  __asm__ __volatile__("movq $28, %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       : "D"(addr), "S"(length), "d"(advice)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-#define _UTSNAME_LENGTH 65
-
-struct UtsNameTy {
-  char sysname[_UTSNAME_LENGTH];  /* Operating system name (e.g., "Linux") */
-  char nodename[_UTSNAME_LENGTH]; /* Name within "some implementation-defined
-                      network" */
-  char release[_UTSNAME_LENGTH]; /* Operating system release (e.g., "2.6.28") */
-  char version[_UTSNAME_LENGTH]; /* Operating system version */
-  char machine[_UTSNAME_LENGTH]; /* Hardware identifier */
-  char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */
-};
-
-int __uname(struct UtsNameTy *Buf) {
-  int Ret;
-  __asm__ __volatile__("movq $63, %%rax\n"
-                       "syscall\n"
-                       : "=a"(Ret)
-                       : "D"(Buf)
-                       : "cc", "rcx", "r11", "memory");
-  return Ret;
-}
-
-struct timespec {
-  uint64_t tv_sec;  /* seconds */
-  uint64_t tv_nsec; /* nanoseconds */
-};
-
-uint64_t __nanosleep(const timespec *req, timespec *rem) {
-  uint64_t ret;
-  __asm__ __volatile__("movq $35, %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       : "D"(req), "S"(rem)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-int64_t __fork() {
-  uint64_t ret;
-  __asm__ __volatile__("movq $57, %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       :
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-int __mprotect(void *addr, size_t len, int prot) {
-  int ret;
-  __asm__ __volatile__("movq $10, %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       : "D"(addr), "S"(len), "d"(prot)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-uint64_t __getppid() {
-  uint64_t ret;
-  __asm__ __volatile__("movq $110, %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       :
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-int __setpgid(uint64_t pid, uint64_t pgid) {
-  int ret;
-  __asm__ __volatile__("movq $109, %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       : "D"(pid), "S"(pgid)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-uint64_t __getpgid(uint64_t pid) {
-  uint64_t ret;
-  __asm__ __volatile__("movq $121, %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       : "D"(pid)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-int __kill(uint64_t pid, int sig) {
-  int ret;
-  __asm__ __volatile__("movq $62, %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       : "D"(pid), "S"(sig)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-int __fsync(int fd) {
-  int ret;
-  __asm__ __volatile__("movq $74, %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       : "D"(fd)
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-//              %rdi          %rsi         %rdx         %r10         %r8
-// sys_prctl    int option    unsigned     unsigned     unsigned     unsigned
-//                            long arg2    long arg3    long arg4    long arg5
-int __prctl(int Option, unsigned long Arg2, unsigned long Arg3,
-            unsigned long Arg4, unsigned long Arg5) {
-  int Ret;
-  register long rdx asm("rdx") = Arg3;
-  register long r8 asm("r8") = Arg5;
-  register long r10 asm("r10") = Arg4;
-  __asm__ __volatile__("movq $157, %%rax\n"
-                       "syscall\n"
-                       : "=a"(Ret)
-                       : "D"(Option), "S"(Arg2), "d"(rdx), "r"(r10), "r"(r8)
-                       :);
-  return Ret;
-}
-
-#endif
-
 void reportError(const char *Msg, uint64_t Size) {
   __write(2, Msg, Size);
   __exit(1);
@@ -609,6 +281,12 @@
   reportError(Buf, Ptr - Buf);
 }
 
+#define SIG_BLOCK 0
+#define SIG_UNBLOCK 1
+#define SIG_SETMASK 2
+
+static const uint64_t MaskAllSignals[] = {-1ULL};
+
 class Mutex {
   volatile bool InUse{false};
 
Index: bolt/runtime/instr.cpp
===================================================================
--- bolt/runtime/instr.cpp
+++ bolt/runtime/instr.cpp
@@ -40,7 +40,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#if defined (__x86_64__)
 #include "common.h"
 
 // Enables a very verbose logging to stderr useful when debugging
@@ -669,12 +668,12 @@
   assert(static_cast<int64_t>(FDdir) >= 0,
          "failed to open /proc/self/map_files");
 
-  while (long Nread = __getdents(FDdir, (struct dirent *)Buf, BufSize)) {
+  while (long Nread = __getdents64(FDdir, (struct dirent64 *)Buf, BufSize)) {
     assert(static_cast<int64_t>(Nread) != -1, "failed to get folder entries");
 
-    struct dirent *d;
+    struct dirent64 *d;
     for (long Bpos = 0; Bpos < Nread; Bpos += d->d_reclen) {
-      d = (struct dirent *)(Buf + Bpos);
+      d = (struct dirent64 *)(Buf + Bpos);
 
       uint64_t StartAddress, EndAddress;
       if (!parseAddressRange(d->d_name, StartAddress, EndAddress))
@@ -1630,6 +1629,17 @@
 /// as well as the target address for the call
 extern "C" __attribute((naked)) void __bolt_instr_indirect_call()
 {
+#if defined(__aarch64__)
+  // clang-format off
+  __asm__ __volatile__(SAVE_ALL
+                       "ldp x0, x1, [sp, #288]\n"
+                       "bl instrumentIndirectCall\n"
+                       RESTORE_ALL
+                       "ret\n"
+                       :::);
+  // clang-format on
+#else
+  // clang-format off
   __asm__ __volatile__(SAVE_ALL
                        "mov 0xa0(%%rsp), %%rdi\n"
                        "mov 0x98(%%rsp), %%rsi\n"
@@ -1637,10 +1647,23 @@
                        RESTORE_ALL
                        "ret\n"
                        :::);
+  // clang-format on
+#endif
 }
 
 extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
 {
+#if defined(__aarch64__)
+  // clang-format off
+  __asm__ __volatile__(SAVE_ALL
+                       "ldp x0, x1, [sp, #288]\n"
+                       "bl instrumentIndirectCall\n"
+                       RESTORE_ALL
+                       "ret\n"
+                       :::);
+  // clang-format on
+#else
+  // clang-format off
   __asm__ __volatile__(SAVE_ALL
                        "mov 0x98(%%rsp), %%rdi\n"
                        "mov 0x90(%%rsp), %%rsi\n"
@@ -1648,21 +1671,54 @@
                        RESTORE_ALL
                        "ret\n"
                        :::);
+  // clang-format on
+#endif
 }
 
 /// This is hooking ELF's entry, it needs to save all machine state.
 extern "C" __attribute((naked)) void __bolt_instr_start()
 {
+#if defined(__aarch64__)
+  // clang-format off
+  __asm__ __volatile__(SAVE_ALL
+                       "bl __bolt_instr_setup\n"
+                       RESTORE_ALL
+                       "adrp x16, __bolt_start_trampoline\n"
+                       "add x16, x16, #:lo12:__bolt_start_trampoline\n"
+                       "br x16\n"
+                       :::);
+  // clang-format on
+#else
+  // clang-format off
   __asm__ __volatile__(SAVE_ALL
                        "call __bolt_instr_setup\n"
                        RESTORE_ALL
                        "jmp __bolt_start_trampoline\n"
                        :::);
+  // clang-format on
+#endif
 }
 
 /// This is hooking into ELF's DT_FINI
 extern "C" void __bolt_instr_fini() {
-  __bolt_fini_trampoline();
+#if defined(__aarch64__)
+  // clang-format off
+  __asm__ __volatile__(SAVE_ALL
+                       "adrp x16, __bolt_fini_trampoline\n"
+                       "add x16, x16, #:lo12:__bolt_fini_trampoline\n"
+                       "blr x16\n"
+                       RESTORE_ALL
+                       ::: "cc", "memory");
+  // clang-format on
+#else
+  // The compiler cannot see the call hidden in this asm statement, so name
+  // every caller-saved register (plus flags and memory) as clobbered.
+  // clang-format off
+  __asm__ __volatile__("call __bolt_fini_trampoline\n"
+                       ::: "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9",
+                           "r10", "r11", "cc", "memory");
+  // clang-format on
+#endif
   if (__bolt_instr_sleep_time == 0)
     __bolt_instr_data_dump();
   DEBUG(report("Finished.\n"));
@@ -1711,4 +1767,3 @@
 }
 
 #endif
-#endif
Index: bolt/runtime/sys_aarch64.h
===================================================================
--- 
/dev/null
+++ bolt/runtime/sys_aarch64.h
@@ -0,0 +1,362 @@
+#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_AARCH64
+#define LLVM_TOOLS_LLVM_BOLT_SYS_AARCH64
+
+// Save x0-x30 in 16-byte pushes so the stack pointer stays 16B-aligned
+#define SAVE_ALL \
+  "stp x0, x1, [sp, #-16]!\n" \
+  "stp x2, x3, [sp, #-16]!\n" \
+  "stp x4, x5, [sp, #-16]!\n" \
+  "stp x6, x7, [sp, #-16]!\n" \
+  "stp x8, x9, [sp, #-16]!\n" \
+  "stp x10, x11, [sp, #-16]!\n" \
+  "stp x12, x13, [sp, #-16]!\n" \
+  "stp x14, x15, [sp, #-16]!\n" \
+  "stp x16, x17, [sp, #-16]!\n" \
+  "stp x18, x19, [sp, #-16]!\n" \
+  "stp x20, x21, [sp, #-16]!\n" \
+  "stp x22, x23, [sp, #-16]!\n" \
+  "stp x24, x25, [sp, #-16]!\n" \
+  "stp x26, x27, [sp, #-16]!\n" \
+  "stp x28, x29, [sp, #-16]!\n" \
+  "str x30, [sp,#-16]!\n"
+// Mirrors SAVE_ALL: pops the same registers in reverse order
+#define RESTORE_ALL \
+  "ldr x30, [sp], #16\n" \
+  "ldp x28, x29, [sp], #16\n" \
+  "ldp x26, x27, [sp], #16\n" \
+  "ldp x24, x25, [sp], #16\n" \
+  "ldp x22, x23, [sp], #16\n" \
+  "ldp x20, x21, [sp], #16\n" \
+  "ldp x18, x19, [sp], #16\n" \
+  "ldp x16, x17, [sp], #16\n" \
+  "ldp x14, x15, [sp], #16\n" \
+  "ldp x12, x13, [sp], #16\n" \
+  "ldp x10, x11, [sp], #16\n" \
+  "ldp x8, x9, [sp], #16\n" \
+  "ldp x6, x7, [sp], #16\n" \
+  "ldp x4, x5, [sp], #16\n" \
+  "ldp x2, x3, [sp], #16\n" \
+  "ldp x0, x1, [sp], #16\n"
+
+// Anonymous namespace covering everything but our library entry point
+namespace {
+
+uint64_t __read(uint64_t fd, const void *buf, uint64_t count) {
+  uint64_t ret;
+  register uint64_t x0 __asm__("x0") = fd;
+  register const void *x1 __asm__("x1") = buf;
+  register uint64_t x2 __asm__("x2") = count;
+  register uint32_t w8 __asm__("w8") = 63; // read
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
+  uint64_t ret;
+  register uint64_t x0 __asm__("x0") = fd;
+  register const void *x1 __asm__("x1") = buf;
+  register uint64_t x2 __asm__("x2") = count;
+  register uint32_t w8 __asm__("w8") = 64; // write
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
+             uint64_t fd, uint64_t offset) {
+  void *ret;
+  register uint64_t x0 __asm__("x0") = addr;
+  register uint64_t x1 __asm__("x1") = size;
+  register uint64_t x2 __asm__("x2") = prot;
+  register uint64_t x3 __asm__("x3") = flags;
+  register uint64_t x4 __asm__("x4") = fd;
+  register uint64_t x5 __asm__("x5") = offset;
+  register uint32_t w8 __asm__("w8") = 222; // mmap
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(x3), "r"(x4), "r"(x5), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+uint64_t __munmap(void *addr, uint64_t size) {
+  uint64_t ret;
+  register void *x0 __asm__("x0") = addr;
+  register uint64_t x1 __asm__("x1") = size;
+  register uint32_t w8 __asm__("w8") = 215; // munmap
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+uint64_t __exit(uint64_t code) {
+  uint64_t ret;
+  register uint64_t x0 __asm__("x0") = code;
+  register uint32_t w8 __asm__("w8") = 94; // exit_group
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0)
+                       : "r"(w8)
+                       : "cc", "memory", "x1");
+  return ret;
+}
+
+uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
+  uint64_t ret;
+  register int x0 __asm__("x0") = -100; // AT_FDCWD
+  register const char *x1 __asm__("x1") = pathname;
+  register uint64_t x2 __asm__("x2") = flags;
+  register uint64_t x3 __asm__("x3") = mode;
+  register uint32_t w8 __asm__("w8") = 56; // openat
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(x3), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) {
+  long ret;
+  register unsigned int x0 __asm__("x0") = fd;
+  register dirent64 *x1 __asm__("x1") = dirp;
+  register size_t x2 __asm__("x2") = count;
+  register uint32_t w8 __asm__("w8") = 61; // getdents64
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) {
+  uint64_t ret;
+  register int x0 __asm__("x0") = -100; // AT_FDCWD
+  register const char *x1 __asm__("x1") = pathname;
+  register char *x2 __asm__("x2") = buf;
+  register size_t x3 __asm__("x3") = bufsize;
+  register uint32_t w8 __asm__("w8") = 78; // readlinkat
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(x3), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
+  uint64_t ret;
+  register uint64_t x0 __asm__("x0") = fd;
+  register uint64_t x1 __asm__("x1") = pos;
+  register uint64_t x2 __asm__("x2") = whence;
+  register uint32_t w8 __asm__("w8") = 62; // lseek
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+int __close(uint64_t fd) {
+  int ret;
+  register uint64_t x0 __asm__("x0") = fd;
+  register uint32_t w8 __asm__("w8") = 57; // close
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0)
+                       : "r"(w8)
+                       : "cc", "memory", "x1");
+  return ret;
+}
+
+int __madvise(void *addr, size_t length, int advice) {
+  int ret;
+  register void *x0 __asm__("x0") = addr;
+  register size_t x1 __asm__("x1") = length;
+  register int x2 __asm__("x2") = advice;
+  register uint32_t w8 __asm__("w8") = 233; // madvise
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+int __uname(struct UtsNameTy *buf) {
+  int ret;
+  register UtsNameTy *x0 __asm__("x0") = buf;
+  register uint32_t w8 __asm__("w8") = 160; // uname
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0)
+                       : "r"(w8)
+                       : "cc", "memory", "x1");
+  return ret;
+}
+
+uint64_t __nanosleep(const timespec *req, timespec *rem) {
+  uint64_t ret;
+  register const timespec *x0 __asm__("x0") = req;
+  register timespec *x1 __asm__("x1") = rem;
+  register uint32_t w8 __asm__("w8") = 101; // nanosleep
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+int64_t __fork() {
+  uint64_t ret;
+  // clone instead of fork with flags
+  // "CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD"
+  register uint64_t x0 __asm__("x0") = 0x1200011;
+  register uint64_t x1 __asm__("x1") = 0;
+  register uint64_t x2 __asm__("x2") = 0;
+  register uint64_t x3 __asm__("x3") = 0;
+  register uint64_t x4 __asm__("x4") = 0;
+  register uint32_t w8 __asm__("w8") = 220; // clone
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(x3), "r"(x4), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+int __mprotect(void *addr, size_t len, int prot) {
+  int ret;
+  register void *x0 __asm__("x0") = addr;
+  register size_t x1 __asm__("x1") = len;
+  register int x2 __asm__("x2") = prot;
+  register uint32_t w8 __asm__("w8") = 226; // mprotect
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+uint64_t __getpid() {
+  uint64_t ret;
+  register uint32_t w8 __asm__("w8") = 172; // getpid
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret)
+                       : "r"(w8)
+                       : "cc", "memory", "x0", "x1");
+  return ret;
+}
+
+uint64_t __getppid() {
+  uint64_t ret;
+  register uint32_t w8 __asm__("w8") = 173; // getppid
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret)
+                       : "r"(w8)
+                       : "cc", "memory", "x0", "x1");
+  return ret;
+}
+
+int __setpgid(uint64_t pid, uint64_t pgid) {
+  int ret;
+  register uint64_t x0 __asm__("x0") = pid;
+  register uint64_t x1 __asm__("x1") = pgid;
+  register uint32_t w8 __asm__("w8") = 154; // setpgid
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+uint64_t __getpgid(uint64_t pid) {
+  uint64_t ret;
+  register uint64_t x0 __asm__("x0") = pid;
+  register uint32_t w8 __asm__("w8") = 155; // getpgid
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0)
+                       : "r"(w8)
+                       : "cc", "memory", "x1");
+  return ret;
+}
+
+int __kill(uint64_t pid, int sig) {
+  int ret;
+  register uint64_t x0 __asm__("x0") = pid;
+  register int x1 __asm__("x1") = sig;
+  register uint32_t w8 __asm__("w8") = 129; // kill
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+int __fsync(int fd) {
+  int ret;
+  register int x0 __asm__("x0") = fd;
+  register uint32_t w8 __asm__("w8") = 82; // fsync
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0)
+                       : "r"(w8)
+                       : "cc", "memory", "x1");
+  return ret;
+}
+
+uint64_t __sigprocmask(int how, const void *set, void *oldset) {
+  uint64_t ret;
+  register int x0 __asm__("x0") = how;
+  register const void *x1 __asm__("x1") = set;
+  register void *x2 __asm__("x2") = oldset;
+  register long x3 asm("x3") = 8; // sigsetsize; x86_64 passes sizeof(uint64_t)
+  register uint32_t w8 __asm__("w8") = 135; // rt_sigprocmask
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(x3), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+int __prctl(int option, unsigned long arg2, unsigned long arg3,
+            unsigned long arg4, unsigned long arg5) {
+  int ret;
+  register int x0 __asm__("x0") = option;
+  register unsigned long x1 __asm__("x1") = arg2;
+  register unsigned long x2 __asm__("x2") = arg3;
+  register unsigned long x3 __asm__("x3") = arg4;
+  register unsigned long x4 __asm__("x4") = arg5;
+  register uint32_t w8 __asm__("w8") = 167; // prctl
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(x3), "r"(x4), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+} // anonymous namespace
+
+#endif
Index: bolt/runtime/sys_x86_64.h
===================================================================
--- /dev/null
+++ bolt/runtime/sys_x86_64.h
@@ -0,0 +1,336 @@
+#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_X86_64
+#define LLVM_TOOLS_LLVM_BOLT_SYS_X86_64
+
+// Save all registers while keeping 16B stack alignment
+#define SAVE_ALL \
+  "push %%rax\n" \
+  "push %%rbx\n" \
+  "push %%rcx\n" \
+  "push %%rdx\n" \
+  "push %%rdi\n" \
+  "push %%rsi\n" \
+  "push %%rbp\n" \
+  "push %%r8\n" \
+  "push %%r9\n" \
+  "push %%r10\n" \
+  "push %%r11\n" \
+  "push %%r12\n" \
+  "push %%r13\n" \
+  "push %%r14\n" \
+  "push %%r15\n" \
+  "sub $8, %%rsp\n"
+// Mirrors SAVE_ALL
+#define RESTORE_ALL \
+  "add $8, %%rsp\n" \
+  "pop %%r15\n" \
+  "pop %%r14\n" \
+  "pop %%r13\n" \
+  "pop %%r12\n" \
+  "pop %%r11\n" \
+  "pop %%r10\n" \
+  "pop %%r9\n" \
+  "pop %%r8\n" \
+  "pop %%rbp\n" \
+  "pop %%rsi\n" \
+  "pop %%rdi\n" \
+  "pop %%rdx\n" \
+  "pop %%rcx\n" \
+  "pop %%rbx\n" \
+  "pop %%rax\n"
+
+namespace {
+
+#define _STRINGIFY(x) #x
+#define STRINGIFY(x) _STRINGIFY(x)
+
+uint64_t __read(uint64_t fd, const void *buf, uint64_t count) {
+  uint64_t ret;
+#if defined(__APPLE__)
+#define READ_SYSCALL 0x2000003
+#else
+#define READ_SYSCALL 0
+#endif
+  __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(fd), "S"(buf), "d"(count)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
+  uint64_t ret;
+#if defined(__APPLE__)
+#define WRITE_SYSCALL 0x2000004
+#else
+#define WRITE_SYSCALL 1
+#endif
+  __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(fd), "S"(buf), "d"(count)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
+             uint64_t fd, uint64_t offset) {
+#if defined(__APPLE__)
+#define MMAP_SYSCALL 0x20000c5
+#else
+#define MMAP_SYSCALL 9
+#endif
+  void *ret;
+  register uint64_t r8 asm("r8") = fd;
+  register uint64_t r9 asm("r9") = offset;
+  register uint64_t r10 asm("r10") = flags;
+  __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8),
+                         "r"(r9)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+uint64_t __munmap(void *addr, uint64_t size) {
+#if defined(__APPLE__)
+#define MUNMAP_SYSCALL 0x2000049
+#else
+#define MUNMAP_SYSCALL 11
+#endif
+  uint64_t ret;
+  __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(addr), "S"(size)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+uint64_t __sigprocmask(int how, const void *set, void *oldset) {
+#if defined(__APPLE__)
+#define SIGPROCMASK_SYSCALL 0x2000030
+#else
+#define SIGPROCMASK_SYSCALL 14
+#endif
+  uint64_t ret;
+  register long r10 asm("r10") = sizeof(uint64_t);
+  __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(how), "S"(set), "d"(oldset), "r"(r10)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+uint64_t __getpid() {
+  uint64_t ret;
+#if defined(__APPLE__)
+#define GETPID_SYSCALL 0x2000014 /* 20 plus the 0x2000000 class prefix */
+#else
+#define GETPID_SYSCALL 39
+#endif
+  __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       :
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+uint64_t __exit(uint64_t code) {
+#if defined(__APPLE__)
+#define EXIT_SYSCALL 0x2000001
+#else
+#define EXIT_SYSCALL 231
+#endif
+  uint64_t ret;
+  __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(code)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+#if !defined(__APPLE__)
+// We use a stack-allocated buffer for string manipulation in many pieces of
+// this code, including the code that prints each line of the fdata file. This
+// buffer needs to accommodate large function names, but shouldn't be arbitrarily
+// large (dynamically allocated) for simplicity of our memory space usage.
+
+// Declare some syscall wrappers we use throughout this code to avoid linking
+// against system libc.
+uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
+  uint64_t ret;
+  __asm__ __volatile__("movq $2, %%rax\n"
+                       "syscall"
+                       : "=a"(ret)
+                       : "D"(pathname), "S"(flags), "d"(mode)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) {
+  long ret;
+  __asm__ __volatile__("movq $217, %%rax\n"
+                       "syscall"
+                       : "=a"(ret)
+                       : "D"(fd), "S"(dirp), "d"(count)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) {
+  uint64_t ret;
+  __asm__ __volatile__("movq $89, %%rax\n"
+                       "syscall"
+                       : "=a"(ret)
+                       : "D"(pathname), "S"(buf), "d"(bufsize)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
+  uint64_t ret;
+  __asm__ __volatile__("movq $8, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(fd), "S"(pos), "d"(whence)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+int __close(uint64_t fd) {
+  uint64_t ret;
+  __asm__ __volatile__("movq $3, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(fd)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+int __madvise(void *addr, size_t length, int advice) {
+  int ret;
+  __asm__ __volatile__("movq $28, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(addr), "S"(length), "d"(advice)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+int __uname(struct UtsNameTy *Buf) {
+  int Ret;
+  __asm__ __volatile__("movq $63, %%rax\n"
+                       "syscall\n"
+                       : "=a"(Ret)
+                       : "D"(Buf)
+                       : "cc", "rcx", "r11", "memory");
+  return Ret;
+}
+
+uint64_t __nanosleep(const timespec *req, timespec *rem) {
+  uint64_t ret;
+  __asm__ __volatile__("movq $35, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(req), "S"(rem)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+int64_t __fork() {
+  uint64_t ret;
+  __asm__ __volatile__("movq $57, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       :
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+int __mprotect(void *addr, size_t len, int prot) {
+  int ret;
+  __asm__ __volatile__("movq $10, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(addr), "S"(len), "d"(prot)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+uint64_t __getppid() {
+  uint64_t ret;
+  __asm__ __volatile__("movq $110, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       :
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+int __setpgid(uint64_t pid, uint64_t pgid) {
+  int ret;
+  __asm__ __volatile__("movq $109, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(pid), "S"(pgid)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+uint64_t __getpgid(uint64_t pid) {
+  uint64_t ret;
+  __asm__ __volatile__("movq $121, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(pid)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+int __kill(uint64_t pid, int sig) {
+  int ret;
+  __asm__ __volatile__("movq $62, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(pid), "S"(sig)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+int __fsync(int fd) {
+  int ret;
+  __asm__ __volatile__("movq $74, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(fd)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+//              %rdi          %rsi         %rdx         %r10         %r8
+// sys_prctl    int option    unsigned     unsigned     unsigned     unsigned
+//                            long arg2    long arg3    long arg4    long arg5
+int __prctl(int Option, unsigned long Arg2, unsigned long Arg3,
+            unsigned long Arg4, unsigned long Arg5) {
+  int Ret;
+  register long rdx asm("rdx") = Arg3;
+  register long r8 asm("r8") = Arg5;
+  register long r10 asm("r10") = Arg4;
+  __asm__ __volatile__("movq $157, %%rax\n"
+                       "syscall\n"
+                       : "=a"(Ret)
+                       : "D"(Option), "S"(Arg2), "d"(rdx), "r"(r10), "r"(r8)
+                       : "cc", "rcx", "r11", "memory");
+  return Ret;
+}
+
+#endif
+
+} // anonymous namespace
+
+#endif