diff --git a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp
--- a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp
+++ b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp
@@ -63,6 +63,16 @@
               "DT_FINI to write the profile\n";
     exit(1);
   }
+
+  if ((opts::InstrumentationWaitForks || opts::InstrumentationSleepTime) &&
+      opts::InstrumentationFileAppendPID) {
+    errs()
+        << "BOLT-ERROR: instrumentation-file-append-pid is not compatible with "
+           "instrumentation-sleep-time and instrumentation-wait-forks. If you "
+           "want a separate profile for each fork, it can only be dumped in "
+           "the end of process when instrumentation-file-append-pid is used.\n";
+    exit(1);
+  }
 }

 void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC,
diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h
--- a/bolt/runtime/common.h
+++ b/bolt/runtime/common.h
@@ -82,6 +82,28 @@
   "pop %%rbx\n" \
   "pop %%rax\n"

+#define PROT_READ 0x1 /* Page can be read. */
+#define PROT_WRITE 0x2 /* Page can be written. */
+#define PROT_EXEC 0x4 /* Page can be executed. */
+#define PROT_NONE 0x0 /* Page cannot be accessed. */
+#define PROT_GROWSDOWN \
+  0x01000000 /* Extend change to start of \
+                growsdown vma (mprotect only). */
+#define PROT_GROWSUP \
+  0x02000000 /* Extend change to start of \
+                growsup vma (mprotect only). */
+
+/* mmap flags; exactly one of MAP_SHARED / MAP_PRIVATE must be chosen. */
+#define MAP_SHARED 0x01 /* Share changes. */
+#define MAP_PRIVATE 0x02 /* Changes are private. */
+#define MAP_FIXED 0x10 /* Interpret addr exactly; OR-ed with the above. */
+
+#if defined(__APPLE__)
+#define MAP_ANONYMOUS 0x1000
+#else
+#define MAP_ANONYMOUS 0x20
+#endif
+
 // Functions that are required by freestanding environment. Compiler may
 // generate calls to these implicitly.
extern "C" {
@@ -222,6 +244,21 @@
   return ret;
 }

+uint64_t __getpid() {
+  uint64_t ret;
+#if defined(__APPLE__)
+#define GETPID_SYSCALL 0x2000014 /* 0x2000000 class prefix | 20, cf. EXIT_SYSCALL */
+#else
+#define GETPID_SYSCALL 39
+#endif
+  __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       :
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
 uint64_t __exit(uint64_t code) {
 #if defined(__APPLE__)
 #define EXIT_SYSCALL 0x2000001
@@ -485,16 +522,6 @@
   return ret;
 }

-uint64_t __getpid() {
-  uint64_t ret;
-  __asm__ __volatile__("movq $39, %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       :
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
 uint64_t __getppid() {
   uint64_t ret;
   __asm__ __volatile__("movq $110, %%rax\n"
diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp
--- a/bolt/runtime/instr.cpp
+++ b/bolt/runtime/instr.cpp
@@ -134,16 +134,10 @@
     Lock L(M);

     if (StackBase == nullptr) {
-#if defined(__APPLE__)
-    int MAP_PRIVATE_MAP_ANONYMOUS = 0x1002;
-#else
-    int MAP_PRIVATE_MAP_ANONYMOUS = 0x22;
-#endif
+
       StackBase = reinterpret_cast<uint8_t *>(
-          __mmap(0, MaxSize, 0x3 /* PROT_READ | PROT_WRITE*/,
-                 Shared ? 0x21 /*MAP_SHARED | MAP_ANONYMOUS*/
-                        : MAP_PRIVATE_MAP_ANONYMOUS /* MAP_PRIVATE | MAP_ANONYMOUS*/,
-                 -1, 0));
+          __mmap(0, MaxSize, PROT_READ | PROT_WRITE,
+                 (Shared ? MAP_SHARED : MAP_PRIVATE) | MAP_ANONYMOUS, -1, 0));
       StackSize = 0;
     }

@@ -215,7 +209,12 @@

 /// Used for allocating indirect call instrumentation counters. Initialized by
 /// __bolt_instr_setup, our initialization routine.
-BumpPtrAllocator GlobalAlloc;
+BumpPtrAllocator *GlobalAlloc;
+
+// Storage for GlobalAlloc which can be shared if not using
+// instrumentation-file-append-pid.
+char *GlobalMetadataStorage;
+
 } // anonymous namespace

 // User-defined placement new operators. We only use those (as opposed to
@@ -235,6 +234,10 @@
   memset(Ptr, C, Sz);
   return Ptr;
 }
+
+// Declaration for global allocator to construct it in shared memory if needed.
+// Needed because we can't #include <new>
+void *operator new(size_t, void *) noexcept;
 // Only called during exception unwinding (useless). We must manually dealloc.
 // C++ language weirdness
 void operator delete(void *Ptr, BumpPtrAllocator &A) { A.deallocate(Ptr); }
@@ -248,6 +251,30 @@
 struct SimpleHashTableEntryBase {
   uint64_t Key;
   uint64_t Val;
+  void dump(const char *Msg = nullptr) {
+    char Buf[BufSize]; // "<pid>: <Msg>0x<this>: MapEntry(0x<Key>, 0x<Val>)\n"
+    char *Ptr = Buf;
+    Ptr = intToStr(Ptr, __getpid(), 10);
+    *Ptr++ = ':';
+    *Ptr++ = ' ';
+    if (Msg)
+      Ptr = strCopy(Ptr, Msg, strLen(Msg)); // Msg length is not bounded here
+    *Ptr++ = '0';
+    *Ptr++ = 'x';
+    Ptr = intToStr(Ptr, (uint64_t)this, 16);
+    *Ptr++ = ':';
+    *Ptr++ = ' ';
+    Ptr = strCopy(Ptr, "MapEntry(0x", sizeof("MapEntry(0x") - 1);
+    Ptr = intToStr(Ptr, Key, 16);
+    *Ptr++ = ',';
+    *Ptr++ = ' ';
+    *Ptr++ = '0';
+    *Ptr++ = 'x';
+    Ptr = intToStr(Ptr, Val, 16);
+    *Ptr++ = ')';
+    *Ptr++ = '\n';
+    __write(2, Buf, Ptr - Buf); // fd 2 (stderr by convention)
+  }
 };

 /// This hash table implementation starts by allocating a table of size
@@ -269,7 +296,17 @@
   /// Increment by 1 the value of \p Key. If it is not in this table, it will be
   /// added to the table and its value set to 1.
   void incrementVal(uint64_t Key, BumpPtrAllocator &Alloc) {
-    ++get(Key, Alloc).Val;
+    if (!__bolt_instr_conservative) { // best effort: may drop this increment
+      TryLock L(M);
+      if (!L.isLocked()) // M is held elsewhere; skip instead of waiting
+        return;
+      auto &E = getOrAllocEntry(Key, Alloc);
+      ++E.Val;
+      return;
+    }
+    Lock L(M); // conservative mode: always update under M
+    auto &E = getOrAllocEntry(Key, Alloc);
+    ++E.Val;
   }

   /// Basic member accessing interface. 
Here we pass the allocator explicitly to
@@ -313,10 +350,10 @@
       if (Entry.Key == VacantMarker)
         continue;
       if (Entry.Key & FollowUpTableMarker) {
-        forEachElement(Callback, IncSize,
-                       reinterpret_cast<MapEntry *>(Entry.Key &
-                                                    ~FollowUpTableMarker),
-                       args...);
+        MapEntry *Next =
+            reinterpret_cast<MapEntry *>(Entry.Key & ~FollowUpTableMarker);
+        assert(Next != Entries, "Circular reference!");
+        forEachElement(Callback, IncSize, Next, args...);
         continue;
       }
       Callback(Entry, args...);
@@ -327,11 +364,13 @@
     TableRoot = new (Alloc, 0) MapEntry[InitialSize];
     MapEntry &Entry = TableRoot[Key % InitialSize];
     Entry.Key = Key;
+    // DEBUG(Entry.dump("Created root entry: "));
     return Entry;
   }

   MapEntry &getEntry(MapEntry *Entries, uint64_t Key, uint64_t Selector,
                      BumpPtrAllocator &Alloc, int CurLevel) {
+    // DEBUG(reportNumber("getEntry called, level ", CurLevel, 10));
     const uint32_t NumEntries = CurLevel == 0 ? InitialSize : IncSize;
     uint64_t Remainder = Selector / NumEntries;
     Selector = Selector % NumEntries;
@@ -339,12 +378,14 @@

     // A hit
     if (Entry.Key == Key) {
+      // DEBUG(Entry.dump("Hit: "));
       return Entry;
     }

     // Vacant - add new entry
     if (Entry.Key == VacantMarker) {
       Entry.Key = Key;
+      // DEBUG(Entry.dump("Adding new entry: "));
       return Entry;
     }

@@ -356,19 +397,32 @@
     }

     // Conflict - create the next level
+    // DEBUG(Entry.dump("Creating new level: "));
+
     MapEntry *NextLevelTbl = new (Alloc, 0) MapEntry[IncSize];
+    // DEBUG(
+    //     reportNumber("Newly allocated level: 0x", uint64_t(NextLevelTbl),
+    //     16));
     uint64_t CurEntrySelector = Entry.Key / InitialSize;
     for (int I = 0; I < CurLevel; ++I)
       CurEntrySelector /= IncSize;
     CurEntrySelector = CurEntrySelector % IncSize;
     NextLevelTbl[CurEntrySelector] = Entry;
     Entry.Key = reinterpret_cast<uint64_t>(NextLevelTbl) | FollowUpTableMarker;
+    assert((NextLevelTbl[CurEntrySelector].Key & ~FollowUpTableMarker) !=
+               uint64_t(Entries),
+           "circular reference created!\n");
+    // DEBUG(NextLevelTbl[CurEntrySelector].dump("New level entry: "));
+    // DEBUG(Entry.dump("Updated old 
entry: "));
     return getEntry(NextLevelTbl, Key, Remainder, Alloc, CurLevel + 1);
   }

   MapEntry &getOrAllocEntry(uint64_t Key, BumpPtrAllocator &Alloc) {
-    if (TableRoot)
-      return getEntry(TableRoot, Key, Key, Alloc, 0);
+    if (TableRoot) {
+      MapEntry &E = getEntry(TableRoot, Key, Key, Alloc, 0);
+      assert(!(E.Key & FollowUpTableMarker), "Invalid entry!");
+      return E;
+    }
     return firstAllocation(Key, Alloc);
   }
 };
@@ -669,7 +723,7 @@
   // mmap our binary to memory
   uint64_t Size = __lseek(FD, 0, 2 /*SEEK_END*/);
   uint8_t *BinContents = reinterpret_cast<uint8_t *>(
-      __mmap(0, Size, 0x1 /* PROT_READ*/, 0x2 /* MAP_PRIVATE*/, FD, 0));
+      __mmap(0, Size, PROT_READ, MAP_PRIVATE, FD, 0));
   Result.MMapPtr = BinContents;
   Result.MMapSize = Size;
   Elf64_Ehdr *Hdr = reinterpret_cast<Elf64_Ehdr *>(BinContents);
@@ -1553,21 +1607,40 @@
   DEBUG(reportNumber("replace mmap start: ", CountersStart, 16));
   DEBUG(reportNumber("replace mmap stop: ", CountersEnd, 16));
   assert (CountersEnd > CountersStart, "no counters");
-  // Maps our counters to be shared instead of private, so we keep counting for
-  // forked processes
-  __mmap(CountersStart, CountersEnd - CountersStart,
-         0x3 /*PROT_READ|PROT_WRITE*/,
-         0x31 /*MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED*/, -1, 0);
+
+  // NOTE(review): assumes __mmap hands back the raw Linux syscall result, where
+  // failure is a negated errno in [-4095, -1] rather than nullptr -- confirm
+  // against common.h. Null is still rejected, as the original assert did.
+  auto MMapFailed = [](const void *Ptr) {
+    return !Ptr || reinterpret_cast<uint64_t>(Ptr) >= uint64_t(-4095);
+  };
+
+  // Remap the counters: shared, so that forked processes keep counting into
+  // the same pages, or private when __bolt_instr_use_pid is set.
+  const uint64_t PrivateOrShared =
+      __bolt_instr_use_pid ? MAP_PRIVATE : MAP_SHARED;
+  void *RemappedCounters =
+      __mmap(CountersStart, CountersEnd - CountersStart, PROT_READ | PROT_WRITE,
+             MAP_ANONYMOUS | PrivateOrShared | MAP_FIXED, -1, 0);
+  assert(!MMapFailed(RemappedCounters), "failed to remap counters!");

   __bolt_ind_call_counter_func_pointer = __bolt_instr_indirect_call;
   __bolt_ind_tailcall_counter_func_pointer = __bolt_instr_indirect_tailcall;
-  // Conservatively reserve 100MiB shared pages
-  GlobalAlloc.setMaxSize(0x6400000);
-  GlobalAlloc.setShared(true);
-  GlobalWriteProfileMutex = new (GlobalAlloc, 0) Mutex();
+
+  bool Shared = !__bolt_instr_use_pid;
+  GlobalMetadataStorage = reinterpret_cast<char *>(
+      __mmap(0, 4096, PROT_READ | PROT_WRITE,
+             (Shared ? MAP_SHARED : MAP_PRIVATE) | MAP_ANONYMOUS, -1, 0));
+  assert(!MMapFailed(GlobalMetadataStorage),
+         "failed to mmap page for metadata!");
+  GlobalAlloc = new (GlobalMetadataStorage) BumpPtrAllocator;
+  // Conservatively reserve 100MiB
+  GlobalAlloc->setMaxSize(0x6400000);
+  GlobalAlloc->setShared(Shared);
+  GlobalWriteProfileMutex = new (*GlobalAlloc, 0) Mutex();
   if (__bolt_instr_num_ind_calls > 0)
     GlobalIndCallCounters =
-        new (GlobalAlloc, 0) IndirectCallHashTable[__bolt_instr_num_ind_calls];
+        new (*GlobalAlloc, 0) IndirectCallHashTable[__bolt_instr_num_ind_calls];

   if (__bolt_instr_sleep_time != 0) {
     // Separate instrumented process to the own process group
@@ -1582,7 +1655,7 @@

 extern "C" __attribute((force_align_arg_pointer)) void
 instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) {
-  GlobalIndCallCounters[IndCallID].incrementVal(Target, GlobalAlloc);
+  GlobalIndCallCounters[IndCallID].incrementVal(Target, *GlobalAlloc);
 }

 /// We receive as in-stack arguments the identifier of the indirect call site
diff --git a/bolt/test/runtime/instrumentation-indirect-2.c b/bolt/test/runtime/instrumentation-indirect-2.c
new file mode 100644
--- /dev/null
+++ b/bolt/test/runtime/instrumentation-indirect-2.c
@@ -0,0 +1,136 @@
+// Check that indirect call hash tables properly register multiple calls,
+// and that calls from different processes don't get mixed up when using
+// --instrumentation-file-append-pid.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+__attribute__((noinline)) void funcA(int pid) { printf("funcA %d\n", pid); }
+__attribute__((noinline)) void funcB(int pid) { printf("funcB %d\n", pid); }
+__attribute__((noinline)) void funcC(int pid) { printf("funcC %d\n", pid); }
+__attribute__((noinline)) void funcD(int pid) { printf("funcD %d\n", pid); }
+__attribute__((noinline)) void funcE(int pid) { printf("funcE %d\n", pid); }
+__attribute__((noinline)) void funcF(int pid) { printf("funcF %d\n", pid); }
+__attribute__((noinline)) void funcG(int pid) { printf("funcG %d\n", pid); }
+__attribute__((noinline)) void funcH(int pid) { printf("funcH %d\n", pid); }
+__attribute__((noinline)) void funcI(int pid) { printf("funcI %d\n", pid); }
+__attribute__((noinline)) void funcJ(int pid) { printf("funcJ %d\n", pid); }
+__attribute__((noinline)) void funcK(int pid) { printf("funcK %d\n", pid); }
+__attribute__((noinline)) void funcL(int pid) { printf("funcL %d\n", pid); }
+__attribute__((noinline)) void funcM(int pid) { printf("funcM %d\n", pid); }
+__attribute__((noinline)) void funcN(int pid) { printf("funcN %d\n", pid); }
+__attribute__((noinline)) void funcO(int pid) { printf("funcO %d\n", pid); }
+__attribute__((noinline)) void funcP(int pid) { printf("funcP %d\n", pid); }
+
+int main() {
+  void (*funcs[])(int) = {funcA, funcB, funcC, funcD, funcE, funcF,
+                          funcG, funcH, funcI, funcJ, funcK, funcL,
+                          funcM, funcN, funcO, funcP};
+  int i;
+
+  switch (fork()) {
+  case -1:
+    printf("Failed to fork!\n");
+    exit(-1);
+    break;
+  case 0:
+    i = 0;
+    break;
+  default:
+    i = 1;
+    break;
+  }
+  int pid = getpid();
+  for (; i < sizeof(funcs) / sizeof(funcs[0]); i += 2) {
+    funcs[i](pid);
+  }
+  // Reap the child (a no-op in the child) so its output and profile are done.
+  wait(NULL);
+
+  return 0;
+}
+/*
+REQUIRES: system-linux
+
+RUN: %clang %cflags %s -o %t.exe -Wl,-q -pie -fpie
+
+RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \
+RUN:   --conservative-instrumentation -o %t.instrumented_conservative
+
+# Instrumented program needs to finish returning zero
+# Both output and profile must contain all 16 functions
+RUN: %t.instrumented_conservative | FileCheck %s --check-prefix=CHECK-ALL
+RUN: cat %t.fdata | FileCheck %s --check-prefix=CHECK-ALL
+
+CHECK-ALL-DAG: funcA
+CHECK-ALL-DAG: funcB
+CHECK-ALL-DAG: funcC
+CHECK-ALL-DAG: funcD
+CHECK-ALL-DAG: funcE
+CHECK-ALL-DAG: funcF
+CHECK-ALL-DAG: funcG
+CHECK-ALL-DAG: funcH
+CHECK-ALL-DAG: funcI
+CHECK-ALL-DAG: funcJ
+CHECK-ALL-DAG: funcK
+CHECK-ALL-DAG: funcL
+CHECK-ALL-DAG: funcM
+CHECK-ALL-DAG: funcN
+CHECK-ALL-DAG: funcO
+CHECK-ALL-DAG: funcP
+
+RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t \
+RUN:   --instrumentation-file-append-pid \
+RUN:   -o %t.instrumented
+
+RUN: %t.instrumented > %t.output
+# Make sure all functions were called
+RUN: cat %t.output | FileCheck %s --check-prefix=CHECK-ALL
+
+RUN: child_pid=$(cat %t.output | grep funcA | awk '{print $2;}')
+RUN: par_pid=$(cat %t.output | grep funcB | awk '{print $2;}')
+RUN: mv %t.$child_pid.fdata %t.child.fdata
+RUN: mv %t.$par_pid.fdata %t.parent.fdata
+
+# Instrumented binary must produce two profiles with only local calls
+# recorded. Functions called only in child should not appear in parent's
+# process and vice versa.
+RUN: cat %t.child.fdata | FileCheck %s --check-prefix=CHECK-CHILD
+RUN: cat %t.parent.fdata | FileCheck %s --check-prefix=CHECK-PARENT
+
+CHECK-CHILD-DAG: funcA
+CHECK-CHILD-NOT: funcB
+CHECK-CHILD-DAG: funcC
+CHECK-CHILD-NOT: funcD
+CHECK-CHILD-DAG: funcE
+CHECK-CHILD-NOT: funcF
+CHECK-CHILD-DAG: funcG
+CHECK-CHILD-NOT: funcH
+CHECK-CHILD-DAG: funcI
+CHECK-CHILD-NOT: funcJ
+CHECK-CHILD-DAG: funcK
+CHECK-CHILD-NOT: funcL
+CHECK-CHILD-DAG: funcM
+CHECK-CHILD-NOT: funcN
+CHECK-CHILD-DAG: funcO
+CHECK-CHILD-NOT: funcP
+
+CHECK-PARENT-NOT: funcA
+CHECK-PARENT-DAG: funcB
+CHECK-PARENT-NOT: funcC
+CHECK-PARENT-DAG: funcD
+CHECK-PARENT-NOT: funcE
+CHECK-PARENT-DAG: funcF
+CHECK-PARENT-NOT: funcG
+CHECK-PARENT-DAG: funcH
+CHECK-PARENT-NOT: funcI
+CHECK-PARENT-DAG: funcJ
+CHECK-PARENT-NOT: funcK
+CHECK-PARENT-DAG: funcL
+CHECK-PARENT-NOT: funcM
+CHECK-PARENT-DAG: funcN
+CHECK-PARENT-NOT: funcO
+CHECK-PARENT-DAG: funcP
+ */