diff --git a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp --- a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp +++ b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp @@ -63,6 +63,16 @@ "DT_FINI to write the profile\n"; exit(1); } + + if ((opts::InstrumentationWaitForks || opts::InstrumentationSleepTime) && + opts::InstrumentationFileAppendPID) { + errs() + << "BOLT-ERROR: instrumentation-file-append-pid is not compatible with " + "instrumentation-sleep-time and instrumentation-wait-forks. If you " + "want a separate profile for each fork, it can only be dumped in " + "the end of process when instrumentation-file-append-pid is used.\n"; + exit(1); + } } void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC, diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h --- a/bolt/runtime/common.h +++ b/bolt/runtime/common.h @@ -246,6 +246,21 @@ return ret; } +uint64_t __getpid() { + uint64_t ret; +#if defined(__APPLE__) +#define GETPID_SYSCALL 20 +#else +#define GETPID_SYSCALL 39 +#endif + __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : + : "cc", "rcx", "r11", "memory"); + return ret; +} + uint64_t __exit(uint64_t code) { #if defined(__APPLE__) #define EXIT_SYSCALL 0x2000001 @@ -509,16 +524,6 @@ return ret; } -uint64_t __getpid() { - uint64_t ret; - __asm__ __volatile__("movq $39, %%rax\n" - "syscall\n" - : "=a"(ret) - : - : "cc", "rcx", "r11", "memory"); - return ret; -} - uint64_t __getppid() { uint64_t ret; __asm__ __volatile__("movq $110, %%rax\n" diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp --- a/bolt/runtime/instr.cpp +++ b/bolt/runtime/instr.cpp @@ -208,7 +208,12 @@ /// Used for allocating indirect call instrumentation counters. Initialized by /// __bolt_instr_setup, our initialization routine. 
-BumpPtrAllocator GlobalAlloc; +BumpPtrAllocator *GlobalAlloc; + +// Storage for GlobalAlloc which can be shared if not using +// instrumentation-file-append-pid. +void *GlobalMetadataStorage; + } // anonymous namespace // User-defined placement new operators. We only use those (as opposed to @@ -228,6 +233,10 @@ memset(Ptr, C, Sz); return Ptr; } + +// Declaration for global allocator to construct it in shared memory if needed. +// Needed because we can't #include <new> +void *operator new(size_t, void *) noexcept; // Only called during exception unwinding (useless). We must manually dealloc. // C++ language weirdness void operator delete(void *Ptr, BumpPtrAllocator &A) { A.deallocate(Ptr); } @@ -241,6 +250,37 @@ struct SimpleHashTableEntryBase { uint64_t Key; uint64_t Val; + void dump(const char *Msg = nullptr) { + // TODO: make some sort of formatting function + // Currently we have to do it the ugly way because + // we want every message to be printed atomically via a single call to + // __write. If we use reportNumber() and others multiple times, we'll get + // garbage in multithreaded environment + char Buf[BufSize]; + char *Ptr = Buf; + Ptr = intToStr(Ptr, __getpid(), 10); + *Ptr++ = ':'; + *Ptr++ = ' '; + if (Msg) + Ptr = strCopy(Ptr, Msg, strLen(Msg)); + *Ptr++ = '0'; + *Ptr++ = 'x'; + Ptr = intToStr(Ptr, (uint64_t)this, 16); + *Ptr++ = ':'; + *Ptr++ = ' '; + Ptr = strCopy(Ptr, "MapEntry(0x", sizeof("MapEntry(0x") - 1); + Ptr = intToStr(Ptr, Key, 16); + *Ptr++ = ','; + *Ptr++ = ' '; + *Ptr++ = '0'; + *Ptr++ = 'x'; + Ptr = intToStr(Ptr, Val, 16); + *Ptr++ = ')'; + *Ptr++ = '\n'; + assert(Ptr - Buf < BufSize, "Buffer overflow!"); + // print everything all at once for atomicity + __write(2, Buf, Ptr - Buf); + } }; /// This hash table implementation starts by allocating a table of size @@ -262,7 +302,17 @@ /// Increment by 1 the value of \p Key. If it is not in this table, it will be /// added to the table and its value set to 1. 
void incrementVal(uint64_t Key, BumpPtrAllocator &Alloc) { - ++get(Key, Alloc).Val; + if (!__bolt_instr_conservative) { + TryLock L(M); + if (!L.isLocked()) + return; + auto &E = getOrAllocEntry(Key, Alloc); + ++E.Val; + return; + } + Lock L(M); + auto &E = getOrAllocEntry(Key, Alloc); + ++E.Val; } /// Basic member accessing interface. Here we pass the allocator explicitly to @@ -306,10 +356,10 @@ if (Entry.Key == VacantMarker) continue; if (Entry.Key & FollowUpTableMarker) { - forEachElement(Callback, IncSize, - reinterpret_cast<MapEntry *>(Entry.Key & - ~FollowUpTableMarker), - args...); + MapEntry *Next = + reinterpret_cast<MapEntry *>(Entry.Key & ~FollowUpTableMarker); + assert(Next != Entries, "Circular reference!"); + forEachElement(Callback, IncSize, Next, args...); continue; } Callback(Entry, args...); @@ -320,11 +370,13 @@ TableRoot = new (Alloc, 0) MapEntry[InitialSize]; MapEntry &Entry = TableRoot[Key % InitialSize]; Entry.Key = Key; + // DEBUG(Entry.dump("Created root entry: ")); return Entry; } MapEntry &getEntry(MapEntry *Entries, uint64_t Key, uint64_t Selector, BumpPtrAllocator &Alloc, int CurLevel) { + // DEBUG(reportNumber("getEntry called, level ", CurLevel, 10)); const uint32_t NumEntries = CurLevel == 0 ? 
InitialSize : IncSize; uint64_t Remainder = Selector / NumEntries; Selector = Selector % NumEntries; @@ -332,12 +384,14 @@ // A hit if (Entry.Key == Key) { + // DEBUG(Entry.dump("Hit: ")); return Entry; } // Vacant - add new entry if (Entry.Key == VacantMarker) { Entry.Key = Key; + // DEBUG(Entry.dump("Adding new entry: ")); return Entry; } @@ -349,19 +403,32 @@ } // Conflict - create the next level + // DEBUG(Entry.dump("Creating new level: ")); + MapEntry *NextLevelTbl = new (Alloc, 0) MapEntry[IncSize]; + // DEBUG( + // reportNumber("Newly allocated level: 0x", uint64_t(NextLevelTbl), + // 16)); uint64_t CurEntrySelector = Entry.Key / InitialSize; for (int I = 0; I < CurLevel; ++I) CurEntrySelector /= IncSize; CurEntrySelector = CurEntrySelector % IncSize; NextLevelTbl[CurEntrySelector] = Entry; Entry.Key = reinterpret_cast<uint64_t>(NextLevelTbl) | FollowUpTableMarker; + assert((NextLevelTbl[CurEntrySelector].Key & ~FollowUpTableMarker) != + uint64_t(Entries), + "circular reference created!\n"); + // DEBUG(NextLevelTbl[CurEntrySelector].dump("New level entry: ")); + // DEBUG(Entry.dump("Updated old entry: ")); return getEntry(NextLevelTbl, Key, Remainder, Alloc, CurLevel + 1); } MapEntry &getOrAllocEntry(uint64_t Key, BumpPtrAllocator &Alloc) { - if (TableRoot) - return getEntry(TableRoot, Key, Key, Alloc, 0); + if (TableRoot) { + MapEntry &E = getEntry(TableRoot, Key, Key, Alloc, 0); + assert(!(E.Key & FollowUpTableMarker), "Invalid entry!"); + return E; + } return firstAllocation(Key, Alloc); } }; @@ -1538,6 +1605,9 @@ /// Initialization code extern "C" void __attribute((force_align_arg_pointer)) __bolt_instr_setup() { + __bolt_ind_call_counter_func_pointer = __bolt_instr_indirect_call; + __bolt_ind_tailcall_counter_func_pointer = __bolt_instr_indirect_tailcall; + const uint64_t CountersStart = reinterpret_cast<uint64_t>(&__bolt_instr_locations[0]); const uint64_t CountersEnd = alignTo( @@ -1546,21 +1616,28 @@ DEBUG(reportNumber("replace mmap start: ", CountersStart, 16)); 
DEBUG(reportNumber("replace mmap stop: ", CountersEnd, 16)); assert (CountersEnd > CountersStart, "no counters"); - // Maps our counters to be shared instead of private, so we keep counting for - // forked processes + + const bool Shared = !__bolt_instr_use_pid; + const uint64_t MapPrivateOrShared = Shared ? MAP_SHARED : MAP_PRIVATE; + void *Ret = __mmap(CountersStart, CountersEnd - CountersStart, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED, -1, 0); + MAP_ANONYMOUS | MapPrivateOrShared | MAP_FIXED, -1, 0); assert(Ret != MAP_FAILED, "Failed to mmap counters!"); - __bolt_ind_call_counter_func_pointer = __bolt_instr_indirect_call; - __bolt_ind_tailcall_counter_func_pointer = __bolt_instr_indirect_tailcall; - // Conservatively reserve 100MiB shared pages - GlobalAlloc.setMaxSize(0x6400000); - GlobalAlloc.setShared(true); - GlobalWriteProfileMutex = new (GlobalAlloc, 0) Mutex(); + + GlobalMetadataStorage = __mmap(0, 4096, PROT_READ | PROT_WRITE, + MapPrivateOrShared | MAP_ANONYMOUS, -1, 0); + assert(GlobalMetadataStorage != MAP_FAILED, + "failed to mmap page for metadata!"); + + GlobalAlloc = new (GlobalMetadataStorage) BumpPtrAllocator; + // Conservatively reserve 100MiB + GlobalAlloc->setMaxSize(0x6400000); + GlobalAlloc->setShared(Shared); + GlobalWriteProfileMutex = new (*GlobalAlloc, 0) Mutex(); if (__bolt_instr_num_ind_calls > 0) GlobalIndCallCounters = - new (GlobalAlloc, 0) IndirectCallHashTable[__bolt_instr_num_ind_calls]; + new (*GlobalAlloc, 0) IndirectCallHashTable[__bolt_instr_num_ind_calls]; if (__bolt_instr_sleep_time != 0) { // Separate instrumented process to the own process group @@ -1575,7 +1652,7 @@ extern "C" __attribute((force_align_arg_pointer)) void instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) { - GlobalIndCallCounters[IndCallID].incrementVal(Target, GlobalAlloc); + GlobalIndCallCounters[IndCallID].incrementVal(Target, *GlobalAlloc); } /// We receive as in-stack arguments the identifier of the indirect call 
site diff --git a/bolt/test/runtime/instrumentation-indirect-2.c b/bolt/test/runtime/instrumentation-indirect-2.c new file mode 100644 --- /dev/null +++ b/bolt/test/runtime/instrumentation-indirect-2.c @@ -0,0 +1,164 @@ +// Check that indirect call hash tables properly register multiple calls, +// and that calls from different processes don't get mixed up when using +// --instrumentation-file-append-pid. + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +__attribute__((noinline)) void funcA(int pid) { printf("funcA %d\n", pid); } +__attribute__((noinline)) void funcB(int pid) { printf("funcB %d\n", pid); } +__attribute__((noinline)) void funcC(int pid) { printf("funcC %d\n", pid); } +__attribute__((noinline)) void funcD(int pid) { printf("funcD %d\n", pid); } +__attribute__((noinline)) void funcE(int pid) { printf("funcE %d\n", pid); } +__attribute__((noinline)) void funcF(int pid) { printf("funcF %d\n", pid); } +__attribute__((noinline)) void funcG(int pid) { printf("funcG %d\n", pid); } +__attribute__((noinline)) void funcH(int pid) { printf("funcH %d\n", pid); } +__attribute__((noinline)) void funcI(int pid) { printf("funcI %d\n", pid); } +__attribute__((noinline)) void funcJ(int pid) { printf("funcJ %d\n", pid); } +__attribute__((noinline)) void funcK(int pid) { printf("funcK %d\n", pid); } +__attribute__((noinline)) void funcL(int pid) { printf("funcL %d\n", pid); } +__attribute__((noinline)) void funcM(int pid) { printf("funcM %d\n", pid); } +__attribute__((noinline)) void funcN(int pid) { printf("funcN %d\n", pid); } +__attribute__((noinline)) void funcO(int pid) { printf("funcO %d\n", pid); } +__attribute__((noinline)) void funcP(int pid) { printf("funcP %d\n", pid); } + +int main() { + + void (*funcs[])(int) = {funcA, funcB, funcC, funcD, funcE, funcF, + funcG, funcH, funcI, funcJ, funcK, funcL, + funcM, funcN, funcO, funcP}; + int i; + + switch (fork()) { + case -1: + printf("Failed to fork!\n"); + exit(-1); + break; + case 0: + i = 0; + break; + default: + i = 1; + 
break; + } + int pid = getpid(); + for (; i < sizeof(funcs) / sizeof(void *); i += 2) { + funcs[i](pid); + } + + return 0; +} +/* +REQUIRES: system-linux + +RUN: %clang %cflags %s -o %t.exe -Wl,-q -no-pie + +RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \ +RUN: --conservative-instrumentation -o %t.instrumented_conservative \ +RUN: --instrumentation-sleep-time=1 --instrumentation-no-counters-clear \ +RUN: --instrumentation-wait-forks + +# Instrumented program needs to finish returning zero +# Both output and profile must contain all 16 functions +RUN: %t.instrumented_conservative > %t.output +# Wait for profile and output to be fully written +RUN: sleep 3 +RUN: cat %t.output | FileCheck %s --check-prefix=CHECK-OUTPUT +RUN: cat %t.fdata | FileCheck %s --check-prefix=CHECK-COMMON-PROF + +CHECK-OUTPUT-DAG: funcA +CHECK-OUTPUT-DAG: funcB +CHECK-OUTPUT-DAG: funcC +CHECK-OUTPUT-DAG: funcD +CHECK-OUTPUT-DAG: funcE +CHECK-OUTPUT-DAG: funcF +CHECK-OUTPUT-DAG: funcG +CHECK-OUTPUT-DAG: funcH +CHECK-OUTPUT-DAG: funcI +CHECK-OUTPUT-DAG: funcJ +CHECK-OUTPUT-DAG: funcK +CHECK-OUTPUT-DAG: funcL +CHECK-OUTPUT-DAG: funcM +CHECK-OUTPUT-DAG: funcN +CHECK-OUTPUT-DAG: funcO +CHECK-OUTPUT-DAG: funcP + +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcA 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcB 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcC 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcD 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcE 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcF 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcG 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcH 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcI 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcJ 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcK 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcL 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcM 0 0 1 
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcN 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcO 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcP 0 0 1 + + +RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t \ +RUN: --instrumentation-file-append-pid \ +RUN: -o %t.instrumented + +RUN: %t.instrumented > %t.output +# Wait till output is fully written in case child outlives parent +RUN: sleep 1 +# Make sure all functions were called +RUN: cat %t.output | FileCheck %s --check-prefix=CHECK-OUTPUT + +RUN: child_pid=$(cat %t.output | grep funcA | awk '{print $2;}') +RUN: par_pid=$(cat %t.output | grep funcB | awk '{print $2;}') +RUN: mv %t.$child_pid.fdata %t.child.fdata +RUN: mv %t.$par_pid.fdata %t.parent.fdata + +# Instrumented binary must produce two profiles with only local calls +# recorded. Functions called only in child should not appear in parent's +# process and vice versa. +RUN: cat %t.child.fdata | FileCheck %s --check-prefix=CHECK-CHILD +RUN: cat %t.child.fdata | FileCheck %s --check-prefix=CHECK-NOCHILD +RUN: cat %t.parent.fdata | FileCheck %s --check-prefix=CHECK-PARENT +RUN: cat %t.parent.fdata | FileCheck %s --check-prefix=CHECK-NOPARENT + +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcA 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcC 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcE 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcG 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcI 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcK 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcM 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcO 0 0 1 + +CHECK-NOCHILD-NOT: funcB +CHECK-NOCHILD-NOT: funcD +CHECK-NOCHILD-NOT: funcF +CHECK-NOCHILD-NOT: funcH +CHECK-NOCHILD-NOT: funcJ +CHECK-NOCHILD-NOT: funcL +CHECK-NOCHILD-NOT: funcN +CHECK-NOCHILD-NOT: funcP + +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcB 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcD 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcF 0 0 1 
+CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcH 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcJ 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcL 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcN 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcP 0 0 1 + +CHECK-NOPARENT-NOT: funcA +CHECK-NOPARENT-NOT: funcC +CHECK-NOPARENT-NOT: funcE +CHECK-NOPARENT-NOT: funcG +CHECK-NOPARENT-NOT: funcI +CHECK-NOPARENT-NOT: funcK +CHECK-NOPARENT-NOT: funcM +CHECK-NOPARENT-NOT: funcO + + */