diff --git a/bolt/include/bolt/Passes/Hugify.h b/bolt/include/bolt/Passes/Hugify.h
new file mode 100644
--- /dev/null
+++ b/bolt/include/bolt/Passes/Hugify.h
@@ -0,0 +1,32 @@
+//===- bolt/Passes/Hugify.h -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BOLT_PASSES_HUGIFY_H
+#define BOLT_PASSES_HUGIFY_H
+
+#include "bolt/Passes/BinaryPasses.h"
+
+namespace llvm {
+namespace bolt {
+
+/// Pass that injects the __bolt_hugify_init_ptr function, a trampoline to the
+/// binary's original entry point, for use by the hugify runtime library.
+class HugePage : public BinaryFunctionPass {
+public:
+  explicit HugePage(const cl::opt<bool> &PrintPass)
+      : BinaryFunctionPass(PrintPass) {}
+
+  void runOnFunctions(BinaryContext &BC) override;
+
+  const char *getName() const override { return "HugePage"; }
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/include/bolt/Utils/CommandLineOpts.h b/bolt/include/bolt/Utils/CommandLineOpts.h
--- a/bolt/include/bolt/Utils/CommandLineOpts.h
+++ b/bolt/include/bolt/Utils/CommandLineOpts.h
@@ -43,6 +43,7 @@
 extern llvm::cl::opt<bool> HotData;
 extern llvm::cl::opt<bool> HotFunctionsAtEnd;
 extern llvm::cl::opt<bool> HotText;
+extern llvm::cl::opt<bool> Hugify;
 extern llvm::cl::opt<bool> Instrument;
 extern llvm::cl::opt<std::string> OutputFilename;
 extern llvm::cl::opt<std::string> PerfData;
diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt
--- a/bolt/lib/Passes/CMakeLists.txt
+++ b/bolt/lib/Passes/CMakeLists.txt
@@ -16,6 +16,7 @@
   FrameOptimizer.cpp
   HFSort.cpp
   HFSortPlus.cpp
+  Hugify.cpp
   IdenticalCodeFolding.cpp
   IndirectCallPromotion.cpp
   Inliner.cpp
diff --git a/bolt/lib/Passes/Hugify.cpp b/bolt/lib/Passes/Hugify.cpp
new file mode 100644
--- /dev/null
+++ b/bolt/lib/Passes/Hugify.cpp
@@ -0,0 +1,69 @@
+//===- bolt/Passes/Hugify.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the HugePage pass, which injects the
+// __bolt_hugify_init_ptr trampoline used by the hugify runtime library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Passes/Hugify.h"
+#include "llvm/Support/CommandLine.h"
+
+#define DEBUG_TYPE "bolt-hugify"
+
+using namespace llvm;
+
+namespace llvm {
+namespace bolt {
+
+/// Registers the .bolt.hugify.entries section and injects the
+/// __bolt_hugify_init_ptr function, a trampoline to the binary's entry point.
+/// Does nothing without a runtime library, for non-ELF inputs, or when the
+/// start function address is unknown.
+void HugePage::runOnFunctions(BinaryContext &BC) {
+  auto *RtLibrary = BC.getRuntimeLibrary();
+  if (!RtLibrary || !BC.isELF() || !BC.StartFunctionAddress)
+    return;
+
+  const auto Flags = BinarySection::getFlags(/*IsReadOnly=*/true,
+                                             /*IsText=*/true,
+                                             /*IsAllocatable=*/true);
+
+  BinarySection &HugifySection = BC.registerOrUpdateSection(
+      ".bolt.hugify.entries", ELF::SHT_PROGBITS, Flags, nullptr);
+
+  // Builds an injected function made of a single basic block holding Instrs.
+  auto createSimpleFunction =
+      [&](std::string Title, std::vector<MCInst> Instrs) -> BinaryFunction * {
+    BinaryFunction *Func = BC.createInjectedBinaryFunction(Title);
+
+    std::vector<std::unique_ptr<BinaryBasicBlock>> BBs;
+    BBs.emplace_back(Func->createBasicBlock(nullptr));
+    BBs.back()->addInstructions(Instrs.begin(), Instrs.end());
+    BBs.back()->setCFIState(0);
+    BBs.back()->setOffset(BinaryBasicBlock::INVALID_OFFSET);
+
+    Func->insertBasicBlocks(nullptr, std::move(BBs),
+                            /*UpdateLayout=*/true,
+                            /*UpdateCFIState=*/false);
+    Func->updateState(BinaryFunction::State::CFG_Finalized);
+    return Func;
+  };
+
+  const BinaryFunction *const Start =
+      BC.getBinaryFunctionAtAddress(*BC.StartFunctionAddress);
+  assert(Start && "Entry point function not found");
+  const MCSymbol *StartSym = Start->getSymbol();
+  createSimpleFunction("__bolt_hugify_init_ptr",
+                       BC.MIB->createSymbolTrampoline(StartSym, BC.Ctx.get()));
+
+  HugifySection.setIsFinalized();
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -13,6 +13,7 @@
 #include "bolt/Passes/AsmDump.h"
 #include "bolt/Passes/CMOVConversion.h"
 #include "bolt/Passes/FrameOptimizer.h"
+#include "bolt/Passes/Hugify.h"
 #include "bolt/Passes/IdenticalCodeFolding.h"
 #include "bolt/Passes/IndirectCallPromotion.h"
 #include "bolt/Passes/Inliner.h"
@@ -315,6 +316,8 @@
 
   if (opts::Instrument)
     Manager.registerPass(std::make_unique<Instrumentation>(NeverPrint));
+  else if (opts::Hugify)
+    Manager.registerPass(std::make_unique<HugePage>(NeverPrint));
 
   // Here we manage dependencies/order manually, since passes are run in the
   // order they're registered.
diff --git a/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp
--- a/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp
+++ b/bolt/lib/RuntimeLibs/HugifyRuntimeLibrary.cpp
@@ -81,12 +81,10 @@
 
   Section->setAlignment(llvm::Align(BC.RegularPageSize));
   Streamer.switchSection(Section);
-
-  Streamer.emitLabel(InitPtr);
   Streamer.emitSymbolAttribute(InitPtr, MCSymbolAttr::MCSA_Global);
   Streamer.emitValue(
       MCSymbolRefExpr::create(StartFunction->getSymbol(), *(BC.Ctx)),
-      /*Size=*/8);
+      /*Size=*/BC.AsmInfo->getCodePointerSize());
 }
 
 void HugifyRuntimeLibrary::link(BinaryContext &BC, StringRef ToolPath,
diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt
--- a/bolt/runtime/CMakeLists.txt
+++ b/bolt/runtime/CMakeLists.txt
@@ -29,7 +29,7 @@
 # Don't let the compiler think it can create calls to standard libs
 target_compile_options(bolt_rt_instr PRIVATE ${BOLT_RT_FLAGS} -fPIE)
 target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
-target_compile_options(bolt_rt_hugify PRIVATE ${BOLT_RT_FLAGS})
+target_compile_options(bolt_rt_hugify PRIVATE ${BOLT_RT_FLAGS} -fPIE)
 target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 
 install(TARGETS bolt_rt_instr DESTINATION lib)
diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h
---
a/bolt/runtime/common.h
+++ b/bolt/runtime/common.h
@@ -281,6 +281,28 @@
   return Size;
 }
 
+/// Finds the first occurrence of the NUL-terminated string \p needle in the
+/// NUL-terminated string \p haystack. \returns a pointer to the start of the
+/// match, \p haystack itself for an empty \p needle (as strstr does), or
+/// nullptr when \p needle does not occur.
+void *strStr(const char *const haystack, const char *const needle) {
+  for (const char *start = haystack;; ++start) {
+    const char *h = start;
+    const char *n = needle;
+    // Walk both strings while they agree. A NUL in haystack always ends the
+    // walk: it either coincides with the end of needle or mismatches it.
+    while (*n != '\0' && *h == *n) {
+      ++h;
+      ++n;
+    }
+    if (*n == '\0')
+      return const_cast<char *>(start);
+    // Haystack ran out first, so no later start position can match either.
+    if (*h == '\0')
+      return nullptr;
+  }
+}
+
 void reportNumber(const char *Msg, uint64_t Num, uint32_t Base) {
   char Buf[BufSize];
   char *Ptr = Buf;
@@ -385,6 +407,32 @@
   return ret;
 }
 
+#define _UTSNAME_LENGTH 65
+
+/// Buffer layout filled in by the uname syscall: six fixed-size character
+/// fields of _UTSNAME_LENGTH bytes each.
+struct utsname {
+  char sysname[_UTSNAME_LENGTH];  /* Operating system name (e.g., "Linux") */
+  char nodename[_UTSNAME_LENGTH]; /* Name within "some implementation-defined
+                                     network" */
+  char release[_UTSNAME_LENGTH]; /* Operating system release (e.g., "2.6.28") */
+  char version[_UTSNAME_LENGTH]; /* Operating system version */
+  char machine[_UTSNAME_LENGTH]; /* Hardware identifier */
+  char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */
+};
+
+/// Issues the uname syscall (number 63 on x86-64) to fill \p buf.
+/// \returns the raw syscall result: 0 on success, a negative errno otherwise.
+int __uname(struct utsname *buf) {
+  int ret;
+  __asm__ __volatile__("movq $63, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(buf)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
 struct timespec {
   uint64_t tv_sec;  /* seconds */
   uint64_t tv_nsec; /* nanoseconds */
@@ -480,6 +528,27 @@
   return ret;
 }
 
+//            %rdi        %rsi       %rdx       %r10       %r8
+// sys_prctl  int option  unsigned   unsigned   unsigned   unsigned
+//                        long arg2  long arg3  long arg4  long arg5
+/// Issues the prctl syscall (number 157 on x86-64) with the given arguments.
+/// \returns the raw syscall result.
+int __prctl(int option, unsigned long arg2, unsigned long arg3,
+            unsigned long arg4, unsigned long arg5) {
+  int ret;
+  register long rdx asm("rdx") = arg3;
+  register long r8 asm("r8") = arg5;
+  register long r10 asm("r10") = arg4;
+  // The syscall instruction overwrites rcx and r11, and the kernel may touch
+  // memory, so declare the same clobbers as __uname does.
+  __asm__ __volatile__("movq $157, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(option), "S"(arg2), "d"(rdx), "r"(r10), "r"(r8)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
 #endif
 
 void reportError(const char *Msg, uint64_t Size) {
diff --git a/bolt/runtime/hugify.cpp
b/bolt/runtime/hugify.cpp
--- a/bolt/runtime/hugify.cpp
+++ b/bolt/runtime/hugify.cpp
@@ -1,8 +1,11 @@
 //===- bolt/runtime/hugify.cpp --------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
+// This file contains code that is linked to the final binary with a function
+// that is called at program entry to put hot code into a huge page.
+//
 //===----------------------------------------------------------------------===//
 
@@ -10,69 +13,154 @@
 #if !defined(__APPLE__)
 
 #include "common.h"
-#include <sys/mman.h>
 
 // Enables a very verbose logging to stderr useful when debugging
-//#define ENABLE_DEBUG
+// #define ENABLE_DEBUG
 
-// Function pointers to init routines in the binary, so we can resume
-// regular execution of the function that we hooked.
-extern void (*__bolt_hugify_init_ptr)();
+// Function contains a trampoline to _start,
+// so we can resume regular execution of the function that we hooked.
+extern void __bolt_hugify_init_ptr();
 
 // The __hot_start and __hot_end symbols set by Bolt. We use them to figure
 // out the rage for marking huge pages.
 extern uint64_t __hot_start;
 extern uint64_t __hot_end;
 
-#ifdef MADV_HUGEPAGE
+/// Starting from character at \p buf, find the longest consecutive sequence
+/// of digits (0-9) and convert it to uint32_t. The converted value
+/// is put into \p ret. \p end marks the end of the buffer to avoid buffer
+/// overflow. The function \returns whether a valid uint32_t value is found;
+/// a digit run whose value does not fit into 32 bits is rejected.
+/// \p buf will be updated to the next character right after the digits.
+static bool scanUInt32(const char *&buf, const char *end, uint32_t &ret) {
+  uint64_t result = 0;
+  bool overflow = false;
+  const char *oldBuf = buf;
+  while (buf < end && ((*buf) >= '0' && (*buf) <= '9')) {
+    result = result * 10 + (*buf) - '0';
+    // Latch the overflow and reset the accumulator so that an arbitrarily
+    // long digit run can never wrap the 64-bit intermediate value.
+    if (result > 0xFFFFFFFFu) {
+      overflow = true;
+      result = 0;
+    }
+    ++buf;
+  }
+  if (oldBuf != buf && !overflow) {
+    ret = static_cast<uint32_t>(result);
+    return true;
+  }
+  return false;
+}
+
+/// Queries the running kernel's release string via uname and parses its
+/// leading "major.minor.release" triple into \p val[0..2]. Components that
+/// cannot be parsed, and all of them when uname fails, are left as 0.
+static void get_kernel_version(uint32_t *val) {
+  val[0] = val[1] = val[2] = 0;
+
+  struct utsname u;
+  if (__uname(&u) != 0)
+    return;
+
+  // release should be in the format: %d.%d.%d
+  // major, minor, release
+  const char *buf = u.release;
+  const char *end = buf + strLen(buf);
+
+  for (int i = 0; i < 3; ++i) {
+    if (!scanUInt32(buf, end, val[i]))
+      return;
+    // The first two components must be followed by a '.' separator.
+    if (i < 2) {
+      if (buf == end || *buf != '.')
+        return;
+      ++buf;
+    }
+  }
+}
+
 /// Check whether the kernel supports THP via corresponding sysfs entry.
+/// thp works only starting from 5.10, so \returns true only when THP is
+/// enabled ("[always]" or "[madvise]" is the selected sysfs option) and
+/// the running kernel is 5.10 or newer.
 static bool has_pagecache_thp_support() {
-  char buf[256] = {0};
-  const char *madviseStr = "always [madvise] never";
+  char buf[64] = {0};
+  const uint64_t madvise_options = 2;
+  const char *const madviseOpt[madvise_options] = {"[always]", "[madvise]"};
 
   int fd = __open("/sys/kernel/mm/transparent_hugepage/enabled",
                   0 /* O_RDONLY */, 0);
   if (fd < 0)
     return false;
 
-  size_t res = __read(fd, buf, 256);
-  if (res < 0)
+  // Read at most sizeof(buf) - 1 bytes so that buf always stays NUL-terminated
+  // for strStr. __read reports failure as a negative errno value, which is
+  // only visible through a signed type.
+  const int64_t res = static_cast<int64_t>(__read(fd, buf, sizeof(buf) - 1));
+  __close(fd);
+  if (res <= 0)
     return false;
 
-  int cmp = strnCmp(buf, madviseStr, strLen(madviseStr));
-  return cmp == 0;
-}
+  uint32_t kernel_version[3] = {0, 0, 0}; // major, minor, release
+  get_kernel_version(kernel_version);
+  const bool kernel_ok =
+      kernel_version[0] > 5 ||
+      (kernel_version[0] == 5 && kernel_version[1] >= 10);
+  if (!kernel_ok)
+    return false;
 
-static void hugify_for_old_kernel(uint8_t *from, uint8_t *to) {
-  size_t size = to - from;
+  for (uint64_t i = 0; i < madvise_options; i++) {
+    if (strStr(buf, madviseOpt[i]))
+      return true;
+  }
+  return false;
+}
+
+/// Moves the hot text in [\p from, \p to) onto freshly mapped anonymous memory
+/// spanning [\p fromAlignedPage, \p toAlignedPage) and asks the kernel to back
+/// that range with huge pages. Used when has_pagecache_thp_support() is false.
+// NOTE(review): only [from, to) is saved and restored, while the whole aligned
+// range is replaced by zero-filled pages; this looks safe only if nothing else
+// lives between the aligned bounds and the hot text -- confirm.
+static void hugify_for_old_kernel(uint8_t *from, uint8_t *to,
+                                  uint8_t *fromAlignedPage,
+                                  uint8_t *toAlignedPage) {
+  const size_t size = to - from;
 
   uint8_t *mem = reinterpret_cast<uint8_t *>(
-      __mmap(0, size, 0x3 /* PROT_READ | PROT_WRITE*/,
-             0x22 /* MAP_PRIVATE | MAP_ANONYMOUS*/, -1, 0));
+      __mmap(0, size, 0x3 /* PROT_READ | PROT_WRITE */,
+             0x22 /* MAP_PRIVATE | MAP_ANONYMOUS */, -1, 0));
 
-  if (mem == (void *)MAP_FAILED) {
-    char msg[] = "Could not allocate memory for text move\n";
+  if (mem == ((void *)-1) /* MAP_FAILED */) {
+    char msg[] = "[hugify] could not allocate memory for text move\n";
     reportError(msg, sizeof(msg));
   }
+
 #ifdef ENABLE_DEBUG
-  reportNumber("Allocated temporary space: ", (uint64_t)mem, 16);
+  reportNumber("[hugify] allocated temporary space: ", (uint64_t)mem, 16);
 #endif
 
   // Copy the hot code to a temproary location.
   memcpy(mem, from, size);
 
+  // Clear the per-process THP-disable flag (arg2 == 0) before requesting
+  // huge pages below.
+  __prctl(41 /* PR_SET_THP_DISABLE */, 0, 0, 0, 0);
   // Maps out the existing hot code.
-  if (__mmap(reinterpret_cast<uint64_t>(from), size,
-             PROT_READ | PROT_WRITE | PROT_EXEC,
-             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1,
-             0) == (void *)MAP_FAILED) {
-    char msg[] = "failed to mmap memory for large page move terminating\n";
+  if (__mmap(reinterpret_cast<uint64_t>(fromAlignedPage),
+             toAlignedPage - fromAlignedPage, 0x3 /* PROT_READ | PROT_WRITE */,
+             0x32 /* MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE */, -1,
+             0) == ((void *)-1) /* MAP_FAILED */) {
+    char msg[] =
+        "[hugify] failed to mmap memory for large page move terminating\n";
     reportError(msg, sizeof(msg));
   }
 
   // Mark the hot code page to be huge page.
-  if (__madvise(from, size, MADV_HUGEPAGE) == -1) {
-    char msg[] = "failed to allocate large page\n";
+  if (__madvise(fromAlignedPage, toAlignedPage - fromAlignedPage,
+                14 /* MADV_HUGEPAGE */) == -1) {
+    char msg[] = "[hugify] failed to allocate large page\n";
     reportError(msg, sizeof(msg));
   }
 
@@ -80,14 +168,13 @@
   memcpy(from, mem, size);
 
   // Change permission back to read-only, ignore failure
-  __mprotect(from, size, PROT_READ | PROT_EXEC);
+  __mprotect(fromAlignedPage, toAlignedPage - fromAlignedPage,
+             0x5 /* PROT_READ | PROT_EXEC */);
 
   __munmap(mem, size);
 }
-#endif
 
 extern "C" void __bolt_hugify_self_impl() {
-#ifdef MADV_HUGEPAGE
   uint8_t *hotStart = (uint8_t *)&__hot_start;
   uint8_t *hotEnd = (uint8_t *)&__hot_end;
   // Make sure the start and end are aligned with huge page address
@@ -104,26 +191,32 @@
 #endif
 
   if (!has_pagecache_thp_support()) {
-    hugify_for_old_kernel(from, to);
+#ifdef ENABLE_DEBUG
+    report("[hugify] workaround with memory alignment for kernel < 5.10\n");
+#endif
+    hugify_for_old_kernel(hotStart, hotEnd, from, to);
     return;
   }
 
-  if (__madvise(from, (to - from), MADV_HUGEPAGE) == -1) {
-    char msg[] = "failed to allocate large page\n";
+  if (__madvise(from, (to - from), 14 /* MADV_HUGEPAGE */) == -1) {
+    char msg[] = "[hugify] failed to allocate large page\n";
     // TODO: allow user to control the failure behavior.
     reportError(msg, sizeof(msg));
   }
-#endif
 }
 
 /// This is hooking ELF's entry, it needs to save all machine state.
 extern "C" __attribute((naked)) void __bolt_hugify_self() {
+#if defined(__x86_64__)
   __asm__ __volatile__(SAVE_ALL
                        "call __bolt_hugify_self_impl\n"
                        RESTORE_ALL
-                       "jmp *__bolt_hugify_init_ptr(%%rip)\n"
+                       "jmp __bolt_hugify_init_ptr\n"
                        :::);
+#else
+  exit(1);
+#endif
 }
 
 #endif
 #endif