diff --git a/bolt/include/bolt/Passes/Hugify.h b/bolt/include/bolt/Passes/Hugify.h new file mode 100644 --- /dev/null +++ b/bolt/include/bolt/Passes/Hugify.h @@ -0,0 +1,35 @@ +//===- bolt/Passes/Hugify.h -------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef BOLT_PASSES_HUGIFY_H +#define BOLT_PASSES_HUGIFY_H + +#include "bolt/Passes/BinaryPasses.h" + +namespace llvm { +namespace bolt { + +/// Injects the `__bolt_hugify_start_program` function, a trampoline to the +/// function at the binary's entry address, which the hugify runtime jumps to +/// after `__bolt_hugify_self_impl` has run. +class HugePage : public BinaryFunctionPass { +public: + explicit HugePage(const cl::opt<bool> &PrintPass) + : BinaryFunctionPass(PrintPass) {} + + /// Creates the trampoline. Does nothing unless a runtime library is set, the + /// binary is ELF and its entry address is known. + void runOnFunctions(BinaryContext &BC) override; + + const char *getName() const override { return "HugePage"; } +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/include/bolt/RuntimeLibs/HugifyRuntimeLibrary.h b/bolt/include/bolt/RuntimeLibs/HugifyRuntimeLibrary.h --- a/bolt/include/bolt/RuntimeLibs/HugifyRuntimeLibrary.h +++ b/bolt/include/bolt/RuntimeLibs/HugifyRuntimeLibrary.h @@ -22,13 +22,11 @@ public: /// Add custom section names generated by the runtime libraries to \p /// SecNames.
- void addRuntimeLibSections(std::vector &SecNames) const final { - SecNames.push_back(".bolt.hugify.entries"); - } + void addRuntimeLibSections(std::vector &SecNames) const final {} void adjustCommandLineOptions(const BinaryContext &BC) const final; - void emitBinary(BinaryContext &BC, MCStreamer &Streamer) final; + void emitBinary(BinaryContext &BC, MCStreamer &Streamer) final {} void link(BinaryContext &BC, StringRef ToolPath, RuntimeDyld &RTDyld, std::function OnLoad) final; diff --git a/bolt/include/bolt/Utils/CommandLineOpts.h b/bolt/include/bolt/Utils/CommandLineOpts.h --- a/bolt/include/bolt/Utils/CommandLineOpts.h +++ b/bolt/include/bolt/Utils/CommandLineOpts.h @@ -44,6 +44,7 @@ extern llvm::cl::opt HotData; extern llvm::cl::opt HotFunctionsAtEnd; extern llvm::cl::opt HotText; +extern llvm::cl::opt Hugify; extern llvm::cl::opt Instrument; extern llvm::cl::opt OutputFilename; extern llvm::cl::opt PerfData; diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt --- a/bolt/lib/Passes/CMakeLists.txt +++ b/bolt/lib/Passes/CMakeLists.txt @@ -15,6 +15,7 @@ FrameOptimizer.cpp HFSort.cpp HFSortPlus.cpp + Hugify.cpp IdenticalCodeFolding.cpp IndirectCallPromotion.cpp Inliner.cpp diff --git a/bolt/lib/Passes/Hugify.cpp b/bolt/lib/Passes/Hugify.cpp new file mode 100644 --- /dev/null +++ b/bolt/lib/Passes/Hugify.cpp @@ -0,0 +1,59 @@ +//===--- bolt/Passes/Hugify.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "bolt/Passes/Hugify.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "bolt-hugify" + +using namespace llvm; + +namespace llvm { +namespace bolt { + +/// Injects `__bolt_hugify_start_program`, a function with a single basic block +/// holding a trampoline to the function at the binary's entry address. Does +/// nothing unless a runtime library is set, the binary is ELF and the entry +/// address is known. +void HugePage::runOnFunctions(BinaryContext &BC) { + auto *RtLibrary = BC.getRuntimeLibrary(); + if (!RtLibrary || !BC.isELF() || !BC.StartFunctionAddress) + return; + + auto createSimpleFunction = + [&](std::string Title, std::vector<MCInst> Instrs) -> BinaryFunction * { + BinaryFunction *Func = BC.createInjectedBinaryFunction(Title); + + std::vector<std::unique_ptr<BinaryBasicBlock>> BBs; + BBs.emplace_back(Func->createBasicBlock(nullptr)); + BBs.back()->addInstructions(Instrs.begin(), Instrs.end()); + BBs.back()->setCFIState(0); + BBs.back()->setOffset(BinaryBasicBlock::INVALID_OFFSET); + + Func->insertBasicBlocks(nullptr, std::move(BBs), + /*UpdateLayout=*/true, + /*UpdateCFIState=*/false); + Func->updateState(BinaryFunction::State::CFG_Finalized); + return Func; + }; + + const BinaryFunction *const Start = + BC.getBinaryFunctionAtAddress(*BC.StartFunctionAddress); + // Checked at run time rather than asserted: an assert disappears in release + // builds and Start is dereferenced right below. + if (!Start) { + errs() << "BOLT-ERROR: failed to locate function at binary start address\n"; + exit(1); + } + const MCSymbol *StartSym = Start->getSymbol(); + createSimpleFunction("__bolt_hugify_start_program", + BC.MIB->createSymbolTrampoline(StartSym, BC.Ctx.get())); +} +} // namespace bolt +} // namespace llvm diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp --- a/bolt/lib/Rewrite/BinaryPassManager.cpp +++ b/bolt/lib/Rewrite/BinaryPassManager.cpp @@ -13,6 +13,7 @@ #include "bolt/Passes/AsmDump.h" #include "bolt/Passes/CMOVConversion.h" #include "bolt/Passes/FrameOptimizer.h" +#include "bolt/Passes/Hugify.h" #include "bolt/Passes/IdenticalCodeFolding.h" #include "bolt/Passes/IndirectCallPromotion.h" #include "bolt/Passes/Inliner.h" @@ -333,6 +334,8 @@ if (opts::Instrument)
Manager.registerPass(std::make_unique(NeverPrint)); + else if (opts::Hugify) + Manager.registerPass(std::make_unique<HugePage>(NeverPrint)); Manager.registerPass(std::make_unique(NeverPrint)); diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -479,6 +479,11 @@ NextAvailableAddress = alignTo(NextAvailableAddress, BC->PageAlign); NextAvailableOffset = alignTo(NextAvailableOffset, BC->PageAlign); + // Hugify only: skip one extra BC->PageAlign on the left side, because ASLR + // maps a relocatable binary at addresses that are only 4KB aligned. + if (opts::Hugify && !BC->HasFixedLoadAddress) + NextAvailableAddress += BC->PageAlign; + if (!opts::UseGnuStack) { // This is where the black magic happens. Creating PHDR table in a segment // other than that containing ELF header is tricky. Some loaders and/or @@ -3700,6 +3705,12 @@ Address = alignTo(Address, Section->getAlignment()); Section->setOutputAddress(Address); Address += Section->getOutputSize(); + + // Hugify only: re-align after the main code section to leave room on the + // right side, because ASLR mapping addresses are only 4KB aligned. + if (opts::Hugify && !BC->HasFixedLoadAddress && + Section->getName() == BC->getMainCodeSectionName()) + Address = alignTo(Address, Section->getAlignment()); } // Make sure we allocate enough space for huge pages.
diff --git a/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp --- a/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp +++ b/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp @@ -60,35 +60,6 @@ } } -void HugifyRuntimeLibrary::emitBinary(BinaryContext &BC, MCStreamer &Streamer) { - const BinaryFunction *StartFunction = - BC.getBinaryFunctionAtAddress(*(BC.StartFunctionAddress)); - assert(!StartFunction->isFragment() && "expected main function fragment"); - if (!StartFunction) { - errs() << "BOLT-ERROR: failed to locate function at binary start address\n"; - exit(1); - } - - const auto Flags = BinarySection::getFlags(/*IsReadOnly=*/false, - /*IsText=*/false, - /*IsAllocatable=*/true); - MCSectionELF *Section = - BC.Ctx->getELFSection(".bolt.hugify.entries", ELF::SHT_PROGBITS, Flags); - - // __bolt_hugify_init_ptr stores the poiter the hugify library needs to - // jump to after finishing the init code. - MCSymbol *InitPtr = BC.Ctx->getOrCreateSymbol("__bolt_hugify_init_ptr"); - - Section->setAlignment(llvm::Align(BC.RegularPageSize)); - Streamer.switchSection(Section); - - Streamer.emitLabel(InitPtr); - Streamer.emitSymbolAttribute(InitPtr, MCSymbolAttr::MCSA_Global); - Streamer.emitValue( - MCSymbolRefExpr::create(StartFunction->getSymbol(), *(BC.Ctx)), - /*Size=*/8); -} - void HugifyRuntimeLibrary::link(BinaryContext &BC, StringRef ToolPath, RuntimeDyld &RTDyld, std::function OnLoad) { diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt --- a/bolt/runtime/CMakeLists.txt +++ b/bolt/runtime/CMakeLists.txt @@ -25,10 +25,11 @@ -fno-exceptions -fno-rtti -fno-stack-protector - -mno-sse) + -mno-sse + -fPIE) # Don't let the compiler think it can create calls to standard libs -target_compile_options(bolt_rt_instr PRIVATE ${BOLT_RT_FLAGS} -fPIE) +target_compile_options(bolt_rt_instr PRIVATE ${BOLT_RT_FLAGS}) target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_compile_options(bolt_rt_hugify 
PRIVATE ${BOLT_RT_FLAGS}) target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h --- a/bolt/runtime/common.h +++ b/bolt/runtime/common.h @@ -283,6 +283,22 @@ return Size; } +/// \returns a pointer to the first occurrence of \p Needle in \p Haystack, or +/// nullptr when there is none. An empty \p Needle never matches. +void *strStr(const char *const Haystack, const char *const Needle) { + // Measure both strings once instead of on every loop iteration. + const uint32_t HaystackLen = strLen(Haystack); + const uint32_t NeedleLen = strLen(Needle); + for (uint32_t I = 0; NeedleLen && I + NeedleLen <= HaystackLen; ++I) { + uint32_t J = 0; + while (J < NeedleLen && Haystack[I + J] == Needle[J]) + ++J; + if (J == NeedleLen) + return (void *)&Haystack[I]; + } + return nullptr; +} + void reportNumber(const char *Msg, uint64_t Num, uint32_t Base) { char Buf[BufSize]; char *Ptr = Buf; @@ -310,6 +326,25 @@ return Res; } +/// Parse the longest run of decimal digits in [\p Buf, \p End) into \p Ret and +/// advance \p Buf past it. \returns true if at least one digit was read and +/// the value fits in uint32_t; \p Ret is left untouched otherwise. +static bool scanUInt32(const char *&Buf, const char *End, uint32_t &Ret) { + uint64_t Result = 0; + const char *OldBuf = Buf; + while (Buf < End && ((*Buf) >= '0' && (*Buf) <= '9')) { + // Stop accumulating once out of range so that Result cannot wrap around. + if (Result <= 0xFFFFFFFFu) + Result = Result * 10 + ((*Buf) - '0'); + ++Buf; + } + if (OldBuf != Buf && Result <= 0xFFFFFFFFu) { + Ret = static_cast<uint32_t>(Result); + return true; + } + return false; +} + #if !defined(__APPLE__) // We use a stack-allocated buffer for string manipulation in many pieces of // this code, including the code that prints each line of the fdata file.
This @@ -387,6 +422,28 @@ return ret; } +#define _UTSNAME_LENGTH 65 + +struct UtsNameTy { + char sysname[_UTSNAME_LENGTH]; /* Operating system name (e.g., "Linux") */ + char nodename[_UTSNAME_LENGTH]; /* Name within "some implementation-defined + network" */ + char release[_UTSNAME_LENGTH]; /* Operating system release (e.g., "2.6.28") */ + char version[_UTSNAME_LENGTH]; /* Operating system version */ + char machine[_UTSNAME_LENGTH]; /* Hardware identifier */ + char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */ +}; + +int __uname(struct UtsNameTy *Buf) { + int Ret; + __asm__ __volatile__("movq $63, %%rax\n" + "syscall\n" + : "=a"(Ret) + : "D"(Buf) + : "cc", "rcx", "r11", "memory"); + return Ret; +} + struct timespec { uint64_t tv_sec; /* seconds */ uint64_t tv_nsec; /* nanoseconds */ @@ -482,6 +539,23 @@ return ret; } +// Issues system call number 157 (sys_prctl). Option, Arg2, Arg3, Arg4 and Arg5 +// are passed in %rdi, %rsi, %rdx, %r10 and %r8. The clobber list matches +// __uname: the syscall instruction overwrites %rcx and %r11. +int __prctl(int Option, unsigned long Arg2, unsigned long Arg3, + unsigned long Arg4, unsigned long Arg5) { + int Ret; + register long rdx asm("rdx") = Arg3; + register long r8 asm("r8") = Arg5; + register long r10 asm("r10") = Arg4; + __asm__ __volatile__("movq $157, %%rax\n" + "syscall\n" + : "=a"(Ret) + : "D"(Option), "S"(Arg2), "d"(rdx), "r"(r10), "r"(r8) + : "cc", "rcx", "r11", "memory"); + return Ret; +} + #endif void reportError(const char *Msg, uint64_t Size) { diff --git a/bolt/runtime/hugify.cpp b/bolt/runtime/hugify.cpp --- a/bolt/runtime/hugify.cpp +++ b/bolt/runtime/hugify.cpp @@ -1,129 +1,181 @@ -//===- bolt/runtime/hugify.cpp --------------------------------------------===// +//===- bolt/runtime/hugify.cpp -------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -//===----------------------------------------------------------------------===// +//===---------------------------------------------------------------------===// #if defined (__x86_64__) #if !defined(__APPLE__) #include "common.h" -#include + +#pragma GCC visibility push(hidden) // Enables a very verbose logging to stderr useful when debugging -//#define ENABLE_DEBUG +// #define ENABLE_DEBUG + +#ifdef ENABLE_DEBUG +#define DEBUG(X) \ + { X; } +#else +#define DEBUG(X) \ + {} +#endif -// Function pointers to init routines in the binary, so we can resume -// regular execution of the function that we hooked. -extern void (*__bolt_hugify_init_ptr)(); +// Function that contains a trampoline to _start, +// so we can resume regular execution of the function that we hooked. +extern void __bolt_hugify_start_program(); // The __hot_start and __hot_end symbols set by Bolt. We use them to figure // out the rage for marking huge pages. extern uint64_t __hot_start; extern uint64_t __hot_end; -#ifdef MADV_HUGEPAGE +/// Read the running kernel's release string via uname and parse its leading +/// "major.minor.release" numbers into \p Val[0..2]. Components that cannot be +/// parsed, and all three if uname fails, are reported as 0. +static void getKernelVersion(uint32_t *Val) { + // Start from 0.0.0 so that an early return never leaves Val uninitialized. + Val[0] = Val[1] = Val[2] = 0; + + struct UtsNameTy UtsName; + // Do not parse UtsName if the kernel did not fill it in. + if (__uname(&UtsName) < 0) + return; + + // release should be in the format: %d.%d.%d + // major, minor, release + const char *Buf = UtsName.release; + const char *End = Buf + strLen(Buf); + + for (int I = 0; I < 3; ++I) { + if (!scanUInt32(Buf, End, Val[I])) + return; + // The first two numbers must each be followed by a '.' separator. + if (I < 2 && *Buf++ != '.') + return; + } +} + /// Check whether the kernel supports THP via corresponding sysfs entry.
-static bool has_pagecache_thp_support() { - char buf[256] = {0}; - const char *madviseStr = "always [madvise] never"; +/// Only kernels 5.10 and newer are treated as supporting it. +static bool hasPagecacheTHPSupport() { + char Buf[64]; - int fd = __open("/sys/kernel/mm/transparent_hugepage/enabled", + int FD = __open("/sys/kernel/mm/transparent_hugepage/enabled", 0 /* O_RDONLY */, 0); - if (fd < 0) + if (FD < 0) + return false; + + // Res is signed so that a failed read is seen as negative; one byte of Buf + // is held back to keep it NUL-terminated for strStr. + memset(Buf, 0, sizeof(Buf)); + const int64_t Res = __read(FD, Buf, sizeof(Buf) - 1); + if (Res < 0) return false; - size_t res = __read(fd, buf, 256); - if (res < 0) + if (!strStr(Buf, "[always]") && !strStr(Buf, "[madvise]")) return false; - int cmp = strnCmp(buf, madviseStr, strLen(madviseStr)); - return cmp == 0; + struct KernelVersionTy { + uint32_t major; + uint32_t minor; + uint32_t release; + }; + + KernelVersionTy KernelVersion = {0, 0, 0}; + getKernelVersion((uint32_t *)&KernelVersion); + // Compare (major, minor) as a pair so that 6.x and later also qualify. + return KernelVersion.major > 5 || + (KernelVersion.major == 5 && KernelVersion.minor >= 10); } -static void hugify_for_old_kernel(uint8_t *from, uint8_t *to) { - size_t size = to - from; +static void hugifyForOldKernel(uint8_t *From, uint8_t *AlignedFrom, + uint8_t *AlignedTo) { + const size_t Size = AlignedTo - From; + const size_t AlignedSize = AlignedTo - AlignedFrom; - uint8_t *mem = reinterpret_cast( - __mmap(0, size, 0x3 /* PROT_READ | PROT_WRITE*/, - 0x22 /* MAP_PRIVATE | MAP_ANONYMOUS*/, -1, 0)); + uint8_t *Mem = reinterpret_cast<uint8_t *>( + __mmap(0, Size, 0x3 /* PROT_READ | PROT_WRITE */, + 0x22 /* MAP_PRIVATE | MAP_ANONYMOUS */, -1, 0)); - if (mem == (void *)MAP_FAILED) { - char msg[] = "Could not allocate memory for text move\n"; - reportError(msg, sizeof(msg)); + if ((uint64_t)Mem >= (uint64_t)-4095 /* MAP_FAILED or -errno */) { + char Msg[] = "[hugify] could not allocate memory for text move\n"; + reportError(Msg, sizeof(Msg)); } -#ifdef ENABLE_DEBUG - reportNumber("Allocated temporary space: ", (uint64_t)mem, 16); -#endif - // Copy the hot code to a temproary location.
- memcpy(mem, from, size); + DEBUG(reportNumber("[hugify] allocated temporary address: ", (uint64_t)Mem, + 16);) + DEBUG(reportNumber("[hugify] allocated size: ", (uint64_t)Size, 16);) + + // Copy the hot code to a temporary location. + memcpy(Mem, From, Size); + __prctl(41 /* PR_SET_THP_DISABLE */, 0, 0, 0, 0); // Maps out the existing hot code. - if (__mmap(reinterpret_cast(from), size, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, - 0) == (void *)MAP_FAILED) { - char msg[] = "failed to mmap memory for large page move terminating\n"; - reportError(msg, sizeof(msg)); + if ((uint64_t)__mmap(reinterpret_cast<uint64_t>(AlignedFrom), AlignedSize, + 0x3 /* PROT_READ | PROT_WRITE */, + 0x32 /* MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE */, -1, + 0) >= (uint64_t)-4095 /* MAP_FAILED or -errno */) { + char Msg[] = + "[hugify] failed to mmap memory for large page move terminating\n"; + reportError(Msg, sizeof(Msg)); } // Mark the hot code page to be huge page. - if (__madvise(from, size, MADV_HUGEPAGE) == -1) { - char msg[] = "failed to allocate large page\n"; - reportError(msg, sizeof(msg)); + if (__madvise(AlignedFrom, AlignedSize, 14 /* MADV_HUGEPAGE */) < 0) { + char Msg[] = "[hugify] setting MADV_HUGEPAGE is failed\n"; + reportError(Msg, sizeof(Msg)); } // Copy the hot code back.
- memcpy(from, mem, size); + memcpy(From, Mem, Size); // Change permission back to read-only, ignore failure - __mprotect(from, size, PROT_READ | PROT_EXEC); + __mprotect(AlignedFrom, AlignedSize, 0x5 /* PROT_READ | PROT_EXEC */); - __munmap(mem, size); + __munmap(Mem, Size); } #endif extern "C" void __bolt_hugify_self_impl() { -#ifdef MADV_HUGEPAGE - uint8_t *hotStart = (uint8_t *)&__hot_start; - uint8_t *hotEnd = (uint8_t *)&__hot_end; + uint8_t *HotStart = (uint8_t *)&__hot_start; + uint8_t *HotEnd = (uint8_t *)&__hot_end; // Make sure the start and end are aligned with huge page address - const size_t hugePageBytes = 2L * 1024 * 1024; - uint8_t *from = hotStart - ((intptr_t)hotStart & (hugePageBytes - 1)); - uint8_t *to = hotEnd + (hugePageBytes - 1); - to -= (intptr_t)to & (hugePageBytes - 1); - -#ifdef ENABLE_DEBUG - reportNumber("[hugify] hot start: ", (uint64_t)hotStart, 16); - reportNumber("[hugify] hot end: ", (uint64_t)hotEnd, 16); - reportNumber("[hugify] aligned huge page from: ", (uint64_t)from, 16); - reportNumber("[hugify] aligned huge page to: ", (uint64_t)to, 16); -#endif - - if (!has_pagecache_thp_support()) { - hugify_for_old_kernel(from, to); + const size_t HugePageBytes = 2L * 1024 * 1024; + uint8_t *From = HotStart - ((intptr_t)HotStart & (HugePageBytes - 1)); + uint8_t *To = HotEnd + (HugePageBytes - 1); + To -= (intptr_t)To & (HugePageBytes - 1); + DEBUG(reportNumber("[hugify] hot start: ", (uint64_t)HotStart, 16);) + DEBUG(reportNumber("[hugify] hot end: ", (uint64_t)HotEnd, 16);) + DEBUG(reportNumber("[hugify] aligned huge page from: ", (uint64_t)From, 16);) + DEBUG(reportNumber("[hugify] aligned huge page to: ", (uint64_t)To, 16);) + + if (!hasPagecacheTHPSupport()) { + DEBUG(report( + "[hugify] workaround with memory alignment for kernel < 5.10\n");) + hugifyForOldKernel(HotStart, From, To); return; } - if (__madvise(from, (to - from), MADV_HUGEPAGE) == -1) { - char msg[] = "failed to allocate large page\n"; + // Any negative result is a failure (-1 or a raw -errno from the syscall). + if (__madvise(From, (To
- From), 14 /* MADV_HUGEPAGE */) < 0) { + char Msg[] = "[hugify] failed to allocate large page\n"; // TODO: allow user to control the failure behavior. - reportError(msg, sizeof(msg)); + reportError(Msg, sizeof(Msg)); } -#endif } /// This is hooking ELF's entry, it needs to save all machine state. extern "C" __attribute((naked)) void __bolt_hugify_self() { - __asm__ __volatile__(SAVE_ALL - "call __bolt_hugify_self_impl\n" - RESTORE_ALL - "jmp *__bolt_hugify_init_ptr(%%rip)\n" - :::); -} - +#if defined(__x86_64__) + __asm__ __volatile__(SAVE_ALL "call __bolt_hugify_self_impl\n" RESTORE_ALL + "jmp __bolt_hugify_start_program\n" :: + :); +#else + exit(1); #endif +} #endif