diff --git a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp --- a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp +++ b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp @@ -63,6 +63,16 @@ "DT_FINI to write the profile\n"; exit(1); } + + if ((opts::InstrumentationWaitForks || opts::InstrumentationSleepTime) && + opts::InstrumentationFileAppendPID) { + errs() + << "BOLT-ERROR: instrumentation-file-append-pid is not compatible with " + "instrumentation-sleep-time and instrumentation-wait-forks. If you " + "want a separate profile for each fork, it can only be dumped in " + "the end of process when instrumentation-file-append-pid is used.\n"; + exit(1); + } } void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC, diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h --- a/bolt/runtime/common.h +++ b/bolt/runtime/common.h @@ -246,6 +246,21 @@ return ret; } +uint64_t __getpid() { + uint64_t ret; +#if defined(__APPLE__) +#define GETPID_SYSCALL 20 +#else +#define GETPID_SYSCALL 39 +#endif + __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : + : "cc", "rcx", "r11", "memory"); + return ret; +} + uint64_t __exit(uint64_t code) { #if defined(__APPLE__) #define EXIT_SYSCALL 0x2000001 @@ -509,16 +524,6 @@ return ret; } -uint64_t __getpid() { - uint64_t ret; - __asm__ __volatile__("movq $39, %%rax\n" - "syscall\n" - : "=a"(ret) - : - : "cc", "rcx", "r11", "memory"); - return ret; -} - uint64_t __getppid() { uint64_t ret; __asm__ __volatile__("movq $110, %%rax\n" diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp --- a/bolt/runtime/instr.cpp +++ b/bolt/runtime/instr.cpp @@ -208,7 +208,12 @@ /// Used for allocating indirect call instrumentation counters. Initialized by /// __bolt_instr_setup, our initialization routine. 
-BumpPtrAllocator GlobalAlloc; +BumpPtrAllocator *GlobalAlloc; + +// Storage for GlobalAlloc which can be shared if not using +// instrumentation-file-append-pid. +void *GlobalMetadataStorage; + } // anonymous namespace // User-defined placement new operators. We only use those (as opposed to @@ -228,6 +233,10 @@ memset(Ptr, C, Sz); return Ptr; } + +// Declaration for global allocator to construct it in shared memory if needed. +// Needed because we can't #include <new> +void *operator new(size_t, void *) noexcept; // Only called during exception unwinding (useless). We must manually dealloc. // C++ language weirdness void operator delete(void *Ptr, BumpPtrAllocator &A) { A.deallocate(Ptr); } @@ -241,6 +250,37 @@ struct SimpleHashTableEntryBase { uint64_t Key; uint64_t Val; + void dump(const char *Msg = nullptr) { + // TODO: make some sort of formatting function + // Currently we have to do it the ugly way because + // we want every message to be printed atomically via a single call to + // __write. If we use reportNumber() and others multiple times, we'll get + // garbage in multithreaded environment + char Buf[BufSize]; + char *Ptr = Buf; + Ptr = intToStr(Ptr, __getpid(), 10); + *Ptr++ = ':'; + *Ptr++ = ' '; + if (Msg) + Ptr = strCopy(Ptr, Msg, strLen(Msg)); + *Ptr++ = '0'; + *Ptr++ = 'x'; + Ptr = intToStr(Ptr, (uint64_t)this, 16); + *Ptr++ = ':'; + *Ptr++ = ' '; + Ptr = strCopy(Ptr, "MapEntry(0x", sizeof("MapEntry(0x") - 1); + Ptr = intToStr(Ptr, Key, 16); + *Ptr++ = ','; + *Ptr++ = ' '; + *Ptr++ = '0'; + *Ptr++ = 'x'; + Ptr = intToStr(Ptr, Val, 16); + *Ptr++ = ')'; + *Ptr++ = '\n'; + assert(Ptr - Buf < BufSize, "Buffer overflow!"); + // print everything all at once for atomicity + __write(2, Buf, Ptr - Buf); + } }; /// This hash table implementation starts by allocating a table of size @@ -262,7 +302,17 @@ /// Increment by 1 the value of \p Key. If it is not in this table, it will be /// added to the table and its value set to 1. 
void incrementVal(uint64_t Key, BumpPtrAllocator &Alloc) { - ++get(Key, Alloc).Val; + if (!__bolt_instr_conservative) { + TryLock L(M); + if (!L.isLocked()) + return; + auto &E = getOrAllocEntry(Key, Alloc); + ++E.Val; + return; + } + Lock L(M); + auto &E = getOrAllocEntry(Key, Alloc); + ++E.Val; } /// Basic member accessing interface. Here we pass the allocator explicitly to @@ -306,10 +356,10 @@ if (Entry.Key == VacantMarker) continue; if (Entry.Key & FollowUpTableMarker) { - forEachElement(Callback, IncSize, - reinterpret_cast<MapEntry *>(Entry.Key & - ~FollowUpTableMarker), - args...); + MapEntry *Next = + reinterpret_cast<MapEntry *>(Entry.Key & ~FollowUpTableMarker); + assert(Next != Entries, "Circular reference!"); + forEachElement(Callback, IncSize, Next, args...); continue; } Callback(Entry, args...); @@ -320,11 +370,13 @@ TableRoot = new (Alloc, 0) MapEntry[InitialSize]; MapEntry &Entry = TableRoot[Key % InitialSize]; Entry.Key = Key; + // DEBUG(Entry.dump("Created root entry: ")); return Entry; } MapEntry &getEntry(MapEntry *Entries, uint64_t Key, uint64_t Selector, BumpPtrAllocator &Alloc, int CurLevel) { + // DEBUG(reportNumber("getEntry called, level ", CurLevel, 10)); const uint32_t NumEntries = CurLevel == 0 ? 
InitialSize : IncSize; uint64_t Remainder = Selector / NumEntries; Selector = Selector % NumEntries; @@ -332,12 +384,14 @@ // A hit if (Entry.Key == Key) { + // DEBUG(Entry.dump("Hit: ")); return Entry; } // Vacant - add new entry if (Entry.Key == VacantMarker) { Entry.Key = Key; + // DEBUG(Entry.dump("Adding new entry: ")); return Entry; } @@ -349,19 +403,32 @@ } // Conflict - create the next level + // DEBUG(Entry.dump("Creating new level: ")); + MapEntry *NextLevelTbl = new (Alloc, 0) MapEntry[IncSize]; + // DEBUG( + // reportNumber("Newly allocated level: 0x", uint64_t(NextLevelTbl), + // 16)); uint64_t CurEntrySelector = Entry.Key / InitialSize; for (int I = 0; I < CurLevel; ++I) CurEntrySelector /= IncSize; CurEntrySelector = CurEntrySelector % IncSize; NextLevelTbl[CurEntrySelector] = Entry; Entry.Key = reinterpret_cast<uint64_t>(NextLevelTbl) | FollowUpTableMarker; + assert((NextLevelTbl[CurEntrySelector].Key & ~FollowUpTableMarker) != + uint64_t(Entries), + "circular reference created!\n"); + // DEBUG(NextLevelTbl[CurEntrySelector].dump("New level entry: ")); + // DEBUG(Entry.dump("Updated old entry: ")); return getEntry(NextLevelTbl, Key, Remainder, Alloc, CurLevel + 1); } MapEntry &getOrAllocEntry(uint64_t Key, BumpPtrAllocator &Alloc) { - if (TableRoot) - return getEntry(TableRoot, Key, Key, Alloc, 0); + if (TableRoot) { + MapEntry &E = getEntry(TableRoot, Key, Key, Alloc, 0); + assert(!(E.Key & FollowUpTableMarker), "Invalid entry!"); + return E; + } return firstAllocation(Key, Alloc); } }; @@ -1538,6 +1605,9 @@ /// Initialization code extern "C" void __attribute((force_align_arg_pointer)) __bolt_instr_setup() { + __bolt_ind_call_counter_func_pointer = __bolt_instr_indirect_call; + __bolt_ind_tailcall_counter_func_pointer = __bolt_instr_indirect_tailcall; + const uint64_t CountersStart = reinterpret_cast<uint64_t>(&__bolt_instr_locations[0]); const uint64_t CountersEnd = alignTo( @@ -1546,21 +1616,28 @@ DEBUG(reportNumber("replace mmap start: ", CountersStart, 16)); 
DEBUG(reportNumber("replace mmap stop: ", CountersEnd, 16)); assert (CountersEnd > CountersStart, "no counters"); - // Maps our counters to be shared instead of private, so we keep counting for - // forked processes + + const bool Shared = !__bolt_instr_use_pid; + const uint64_t MapPrivateOrShared = Shared ? MAP_SHARED : MAP_PRIVATE; + void *Ret = __mmap(CountersStart, CountersEnd - CountersStart, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED, -1, 0); + MAP_ANONYMOUS | MapPrivateOrShared | MAP_FIXED, -1, 0); assert(Ret != MAP_FAILED, "Failed to mmap counters!"); - __bolt_ind_call_counter_func_pointer = __bolt_instr_indirect_call; - __bolt_ind_tailcall_counter_func_pointer = __bolt_instr_indirect_tailcall; - // Conservatively reserve 100MiB shared pages - GlobalAlloc.setMaxSize(0x6400000); - GlobalAlloc.setShared(true); - GlobalWriteProfileMutex = new (GlobalAlloc, 0) Mutex(); + + GlobalMetadataStorage = __mmap(0, 4096, PROT_READ | PROT_WRITE, + MapPrivateOrShared | MAP_ANONYMOUS, -1, 0); + assert(GlobalMetadataStorage != MAP_FAILED, + "failed to mmap page for metadata!"); + + GlobalAlloc = new (GlobalMetadataStorage) BumpPtrAllocator; + // Conservatively reserve 100MiB + GlobalAlloc->setMaxSize(0x6400000); + GlobalAlloc->setShared(Shared); + GlobalWriteProfileMutex = new (*GlobalAlloc, 0) Mutex(); if (__bolt_instr_num_ind_calls > 0) GlobalIndCallCounters = - new (GlobalAlloc, 0) IndirectCallHashTable[__bolt_instr_num_ind_calls]; + new (*GlobalAlloc, 0) IndirectCallHashTable[__bolt_instr_num_ind_calls]; if (__bolt_instr_sleep_time != 0) { // Separate instrumented process to the own process group @@ -1575,7 +1652,7 @@ extern "C" __attribute((force_align_arg_pointer)) void instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) { - GlobalIndCallCounters[IndCallID].incrementVal(Target, GlobalAlloc); + GlobalIndCallCounters[IndCallID].incrementVal(Target, *GlobalAlloc); } /// We receive as in-stack arguments the identifier of the indirect call 
site diff --git a/bolt/test/runtime/instrumentation-indirect-2.c b/bolt/test/runtime/instrumentation-indirect-2.c new file mode 100644 --- /dev/null +++ b/bolt/test/runtime/instrumentation-indirect-2.c @@ -0,0 +1,164 @@ +// Check that indirect call hash tables properly register multiple calls, +// and that calls from different processes don't get mixed up when using +// --instrumentation-file-append-pid. + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +__attribute__((noinline)) void funcA(int pid) { printf("funcA %d\n", pid); } +__attribute__((noinline)) void funcB(int pid) { printf("funcB %d\n", pid); } +__attribute__((noinline)) void funcC(int pid) { printf("funcC %d\n", pid); } +__attribute__((noinline)) void funcD(int pid) { printf("funcD %d\n", pid); } +__attribute__((noinline)) void funcE(int pid) { printf("funcE %d\n", pid); } +__attribute__((noinline)) void funcF(int pid) { printf("funcF %d\n", pid); } +__attribute__((noinline)) void funcG(int pid) { printf("funcG %d\n", pid); } +__attribute__((noinline)) void funcH(int pid) { printf("funcH %d\n", pid); } +__attribute__((noinline)) void funcI(int pid) { printf("funcI %d\n", pid); } +__attribute__((noinline)) void funcJ(int pid) { printf("funcJ %d\n", pid); } +__attribute__((noinline)) void funcK(int pid) { printf("funcK %d\n", pid); } +__attribute__((noinline)) void funcL(int pid) { printf("funcL %d\n", pid); } +__attribute__((noinline)) void funcM(int pid) { printf("funcM %d\n", pid); } +__attribute__((noinline)) void funcN(int pid) { printf("funcN %d\n", pid); } +__attribute__((noinline)) void funcO(int pid) { printf("funcO %d\n", pid); } +__attribute__((noinline)) void funcP(int pid) { printf("funcP %d\n", pid); } + +int main() { + + void (*funcs[])(int) = {funcA, funcB, funcC, funcD, funcE, funcF, + funcG, funcH, funcI, funcJ, funcK, funcL, + funcM, funcN, funcO, funcP}; + int i; + + switch (fork()) { + case -1: + printf("Failed to fork!\n"); + exit(-1); + break; + case 0: + i = 0; + break; + default: + i = 1; + 
break; + } + int pid = getpid(); + for (; i < sizeof(funcs) / sizeof(void *); i += 2) { + funcs[i](pid); + } + + return 0; +} +/* +REQUIRES: system-linux + +RUN: %clang %cflags %s -o %t.exe -Wl,-q -no-pie + +RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \ +RUN: --conservative-instrumentation -o %t.instrumented_conservative \ +RUN: --instrumentation-sleep-time=1 --instrumentation-no-counters-clear \ +RUN: --instrumentation-wait-forks + +# Instrumented program needs to finish returning zero +# Both output and profile must contain all 16 functions +RUN: %t.instrumented_conservative > %t.output +# Wait for profile and output to be fully written +RUN: sleep 3 +RUN: cat %t.output | FileCheck %s --check-prefix=CHECK-OUTPUT +RUN: cat %t.fdata | FileCheck %s --check-prefix=CHECK-COMMON-PROF + +CHECK-OUTPUT-DAG: funcA +CHECK-OUTPUT-DAG: funcB +CHECK-OUTPUT-DAG: funcC +CHECK-OUTPUT-DAG: funcD +CHECK-OUTPUT-DAG: funcE +CHECK-OUTPUT-DAG: funcF +CHECK-OUTPUT-DAG: funcG +CHECK-OUTPUT-DAG: funcH +CHECK-OUTPUT-DAG: funcI +CHECK-OUTPUT-DAG: funcJ +CHECK-OUTPUT-DAG: funcK +CHECK-OUTPUT-DAG: funcL +CHECK-OUTPUT-DAG: funcM +CHECK-OUTPUT-DAG: funcN +CHECK-OUTPUT-DAG: funcO +CHECK-OUTPUT-DAG: funcP + +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcA 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcB 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcC 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcD 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcE 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcF 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcG 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcH 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcI 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcJ 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcK 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcL 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcM 0 0 1 
+CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcN 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcO 0 0 1 +CHECK-COMMON-PROF-DAG: 1 main {{[0-9a-f]+}} 1 funcP 0 0 1 + + +RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t \ +RUN: --instrumentation-file-append-pid \ +RUN: -o %t.instrumented + +RUN: %t.instrumented > %t.output +# Wait till output is fully written in case child outlives parent +RUN: sleep 1 +# Make sure all functions were called +RUN: cat %t.output | FileCheck %s --check-prefix=CHECK-OUTPUT + +RUN: child_pid=$(cat %t.output | grep funcA | awk '{print $2;}') +RUN: par_pid=$(cat %t.output | grep funcB | awk '{print $2;}') +RUN: mv %t.$child_pid.fdata %t.child.fdata +RUN: mv %t.$par_pid.fdata %t.parent.fdata + +# Instrumented binary must produce two profiles with only local calls +# recorded. Functions called only in child should not appear in parent's +# process and vice versa. +RUN: cat %t.child.fdata | FileCheck %s --check-prefix=CHECK-CHILD +RUN: cat %t.child.fdata | FileCheck %s --check-prefix=CHECK-NOCHILD +RUN: cat %t.parent.fdata | FileCheck %s --check-prefix=CHECK-PARENT +RUN: cat %t.parent.fdata | FileCheck %s --check-prefix=CHECK-NOPARENT + +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcA 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcC 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcE 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcG 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcI 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcK 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcM 0 0 1 +CHECK-CHILD-DAG: 1 main {{[0-9a-f]+}} 1 funcO 0 0 1 + +CHECK-NOCHILD-NOT: funcB +CHECK-NOCHILD-NOT: funcD +CHECK-NOCHILD-NOT: funcF +CHECK-NOCHILD-NOT: funcH +CHECK-NOCHILD-NOT: funcJ +CHECK-NOCHILD-NOT: funcL +CHECK-NOCHILD-NOT: funcN +CHECK-NOCHILD-NOT: funcP + +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcB 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcD 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcF 0 0 1 
+CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcH 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcJ 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcL 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcN 0 0 1 +CHECK-PARENT-DAG: 1 main {{[0-9a-f]+}} 1 funcP 0 0 1 + +CHECK-NOPARENT-NOT: funcA +CHECK-NOPARENT-NOT: funcC +CHECK-NOPARENT-NOT: funcE +CHECK-NOPARENT-NOT: funcG +CHECK-NOPARENT-NOT: funcI +CHECK-NOPARENT-NOT: funcK +CHECK-NOPARENT-NOT: funcM +CHECK-NOPARENT-NOT: funcO + + */