diff --git a/llvm/examples/CMakeLists.txt b/llvm/examples/CMakeLists.txt
--- a/llvm/examples/CMakeLists.txt
+++ b/llvm/examples/CMakeLists.txt
@@ -8,6 +8,7 @@
 add_subdirectory(ModuleMaker)
 add_subdirectory(SpeculativeJIT)
 add_subdirectory(Bye)
+add_subdirectory(ThinLtoJIT)
 
 if(LLVM_ENABLE_EH AND (NOT WIN32) AND (NOT "${LLVM_NATIVE_ARCH}" STREQUAL "ARM"))
     add_subdirectory(ExceptionDemo)
diff --git a/llvm/examples/ThinLtoJIT/CMakeLists.txt b/llvm/examples/ThinLtoJIT/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/CMakeLists.txt
@@ -0,0 +1,18 @@
+set(LLVM_LINK_COMPONENTS # LLVM components named for this example
+  Core
+  IRReader
+  OrcJIT
+  ExecutionEngine
+  Support
+  nativecodegen
+  Analysis
+  Passes
+  )
+
+add_llvm_example(ThinLtoJIT # Example target followed by its source files
+  main.cpp
+  ThinLtoJIT.cpp
+  ThinLtoModuleIndex.cpp
+  ThinLtoInstrumentationLayer.cpp
+  ThinLtoDiscoveryThread.cpp
+  )
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h
@@ -0,0 +1,52 @@
+#ifndef LLVM_EXAMPLES_THINLTOJIT_DISCOVERYTHREAD_H
+#define LLVM_EXAMPLES_THINLTOJIT_DISCOVERYTHREAD_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
+
+#include "ThinLtoJIT.h"
+
+#include <atomic>
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+class ExecutionSession;
+class ThinLtoModuleIndex;
+class ThinLtoInstrumentationLayer;
+
+class ThinLtoDiscoveryThread {
+public:
+  ThinLtoDiscoveryThread(std::atomic<bool> &RunningFlag, ExecutionSession &ES,
+                         JITDylib *MainJD, ThinLtoInstrumentationLayer &L,
+                         ThinLtoModuleIndex &GlobalIndex,
+                         unsigned LookaheadLevels, unsigned NumLoadThreads,
+                         MangleAndInterner &Mangle)
+      : KeepRunning(RunningFlag), ES(ES), Layer(L), GlobalIndex(GlobalIndex),
+        SearchOrder(makeJITDylibSearchOrder({MainJD})),
+        LookaheadLevels(LookaheadLevels), NumLoadThreads(NumLoadThreads),
+        Mangle(Mangle) {}
+  /// Thread body: keeps polling for fired flags until KeepRunning is cleared.
+  void operator()();
+  /// Prefix of the dummy symbol names that stand for module requests.
+  static constexpr const char *getModuleRequestPrefix() { return "file://"; }
+
+private:
+  std::atomic<bool> &KeepRunning; // Refers to the flag passed to the ctor.
+  ExecutionSession &ES;
+  ThinLtoInstrumentationLayer &Layer;
+  ThinLtoModuleIndex &GlobalIndex;
+  JITDylibSearchOrder SearchOrder; // Built from MainJD only.
+  unsigned LookaheadLevels;        // Passed to discoverCalleeModulePaths().
+  unsigned NumLoadThreads;         // Twice as many paths are taken per round.
+  MangleAndInterner &Mangle;
+
+  void spawnLookupForHighRankModules();
+  SymbolStringPtr makeDummySymbolName(const std::string &Path) const;
+};
+
+} // namespace orc
+} // namespace llvm
+
+#endif
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp
@@ -0,0 +1,88 @@
+#include "ThinLtoDiscoveryThread.h"
+
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+
+#include "ThinLtoInstrumentationLayer.h"
+#include "ThinLtoModuleIndex.h"
+
+#include <thread>
+
+#define DEBUG_TYPE "thinltojit"
+
+namespace llvm {
+namespace orc {
+
+void ThinLtoDiscoveryThread::operator()() {
+  // Poll the instrumentation layer until the running flag gets cleared.
+  while (KeepRunning.load()) {
+    std::vector<unsigned> FiredFlags = Layer.takeFlagsThatFired();
+    if (FiredFlags.empty())
+      continue;
+    LLVM_DEBUG(dbgs() << FiredFlags.size() << " new flags raised\n");
+    // Map the flags to their functions and discover the callees' modules.
+    for (GlobalValue::GUID Guid : Layer.takeFlagOwners(std::move(FiredFlags))) {
+      GlobalValueSummary *Summary = GlobalIndex.getSummary(Guid);
+      if (!Summary) {
+        LLVM_DEBUG(dbgs() << "No summary for GUID: " << Guid << "\n");
+        continue;
+      }
+      assert(isa<FunctionSummary>(Summary) && "Reached symbols are functions");
+      GlobalIndex.discoverCalleeModulePaths(cast<FunctionSummary>(Summary),
+                                            LookaheadLevels);
+    }
+
+    if (GlobalIndex.getNumDiscoveredModules() > 0)
+      spawnLookupForHighRankModules();
+  }
+}
+
+void ThinLtoDiscoveryThread::spawnLookupForHighRankModules() {
+  // Schedules parsing for the next batch of discovered modules and requests
+  // them from the JIT through one asynchronous lookup of dummy symbols.
+  SymbolLookupSet SymbolSet;
+
+  // TODO: The size of a ThreadPool's task queue is not accessible. It would
+  // be great to know in order to estimate how many modules we schedule. The
+  // more we schedule, the less precise is the ranking. The less we schedule,
+  // the higher the risk for downtime. For now let's take two times the number
+  // of threads.
+  for (std::string Path : GlobalIndex.getNextPaths(2 * NumLoadThreads)) {
+    // Kick-off parsing as soon as possible.
+    if (Error ScheduleErr = GlobalIndex.scheduleModuleParsing(Path)) {
+      ES.reportError(std::move(ScheduleErr));
+      continue;
+    }
+
+    // We cannot add modules concurrently, because a request looking for
+    // their symbols might be in flight. Instead we issue a new lookup and
+    // let it run into our definition generator, which can add the module
+    // safely. The bitcode is not parsed yet and ThinLTO summaries only hold
+    // hashed function names, so we request a dummy symbol that encodes the
+    // module path. It never resolves, hence it is weakly referenced.
+    SymbolSet.add(makeDummySymbolName(Path),
+                  SymbolLookupFlags::WeaklyReferencedSymbol);
+  }
+
+  // The lookup request will wait for the session lock.
+  auto LookupWorker = [this, SymbolSet = std::move(SymbolSet)]() {
+    auto SymbolMap = ES.lookup(SearchOrder, SymbolSet);
+    if (!SymbolMap) {
+      // A failed Expected holds no value, so it must not be dereferenced.
+      ES.reportError(SymbolMap.takeError());
+      return;
+    }
+    assert(SymbolMap->empty() && "Dummy symbols are not defined");
+  };
+  std::thread(std::move(LookupWorker)).detach();
+}
+
+SymbolStringPtr
+ThinLtoDiscoveryThread::makeDummySymbolName(const std::string &Path) const {
+  // Dummy name = module request prefix followed by the module path.
+  return Mangle(std::string(getModuleRequestPrefix()) + Path);
+}
+
+} // namespace orc
+} // namespace llvm
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h
@@ -0,0 +1,78 @@
+#ifndef LLVM_EXAMPLES_THINLTOJIT_DISCOVERYLAYER_H
+#define LLVM_EXAMPLES_THINLTOJIT_DISCOVERYLAYER_H
+
+#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
+#include "llvm/ExecutionEngine/Orc/Layer.h"
+#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+
+#include <atomic>
+#include <cstdint>
+#include <map>
+#include <mutex>
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+class ThinLtoInstrumentationLayer : public IRLayer {
+public:
+  /// Bitmask: which flag writers also do a release store to the sync value.
+  enum ExplicitMemoryBarrier {
+    Never = 0, StaticCode = 1, JITedCode = 2, Always = 3
+  };
+
+  ThinLtoInstrumentationLayer(ExecutionSession &ES, IRCompileLayer &BaseLayer,
+                              ExplicitMemoryBarrier InsertMemBarrier,
+                              unsigned FlagsPerBucket)
+      : IRLayer(ES), BaseLayer(BaseLayer), InsertMemBarrier(InsertMemBarrier) {
+    // TODO: So far we only allocate one bucket.
+    allocateDiscoveryFlags(FlagsPerBucket);
+  }
+  ~ThinLtoInstrumentationLayer() override;
+
+  /// Instruments every defined function, then emits via the base layer.
+  void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override;
+
+  /// Reserves Count consecutive flags; returns the index of the first one.
+  unsigned reserveDiscoveryFlags(unsigned Count);
+  void registerDiscoveryFlagOwners(std::vector<GlobalValue::GUID> Guids,
+                                   unsigned FirstIdx);
+
+  /// Registers flags for Functions and marks them as fired right away.
+  void nudgeIntoDiscovery(std::vector<GlobalValue::GUID> Functions);
+
+  std::vector<unsigned> takeFlagsThatFired();
+  std::vector<GlobalValue::GUID> takeFlagOwners(std::vector<unsigned> Indexes);
+
+private:
+  IRCompileLayer &BaseLayer;
+  ExplicitMemoryBarrier InsertMemBarrier;
+  enum Flag : uint8_t { Clear = 0, Fired = 1 };
+
+  // Lock-free read access.
+  uint8_t *FlagsStorage{nullptr};
+  Flag *FlagsIncoming{nullptr}; // lock-free write by design
+  Flag *FlagsHandled{nullptr};
+  unsigned NumFlagsAllocated{0};
+  std::atomic<unsigned> NumFlagsUsed{0}; // spin-lock
+
+  // Acquire/release sync between writers and reader. It is zero-initialized
+  // because the reader may load it before any writer has stored to it.
+  std::atomic<uint64_t> FlagsSync{0};
+
+  // STL container requires locking for both, read and write access.
+  mutable std::mutex DiscoveryFlagsInfoLock;
+  std::map<unsigned, GlobalValue::GUID> FlagOwnersMap;
+
+  void allocateDiscoveryFlags(unsigned MinFlags);
+  void compileFunctionReachedFlagSetter(IRBuilder<> &B, Flag *F);
+};
+
+} // namespace orc
+} // namespace llvm
+
+#endif
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp
@@ -0,0 +1,230 @@
+#include "ThinLtoInstrumentationLayer.h"
+
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Process.h"
+
+#include <cstdlib>
+
+#define DEBUG_TYPE "thinltojit"
+
+namespace llvm {
+namespace orc {
+
+// Allocates zero-initialized, page-aligned storage for the discovery flags.
+// TODO: Fixed set of flags may not always be enough. Make this expandable.
+void ThinLtoInstrumentationLayer::allocateDiscoveryFlags(unsigned MinFlags) {
+  unsigned PageSize = sys::Process::getPageSizeEstimate();
+  unsigned NumPagesEach = (MinFlags + (PageSize - 1)) / PageSize; // round up
+  unsigned NumPagesTotal = 2 * NumPagesEach;
+  assert(isPowerOf2_64(PageSize) && "Adjust aligned memory alloc below");
+  // One extra page makes up for the size lost to alignment.
+  void *Storage = std::calloc(NumPagesTotal + 1, PageSize);
+  if (!Storage)
+    std::abort(); // Never derive flag addresses from a failed allocation.
+  uint64_t StorageAddr = reinterpret_cast<uint64_t>(Storage);
+  uint64_t PageSizeDecr = PageSize - 1;
+  uint64_t AlignedAddr = ((StorageAddr + PageSizeDecr) & ~PageSizeDecr);
+  uint64_t Diff = AlignedAddr - StorageAddr;
+
+  // For each flag we allocate one byte in each location: Incoming and Handled.
+  // TODO: 'Handled' could be a bitset, but size must be dynamic
+  NumFlagsUsed.store(0);
+  NumFlagsAllocated = NumPagesEach * PageSize;
+  FlagsStorage = static_cast<uint8_t *>(Storage);
+  FlagsIncoming = reinterpret_cast<Flag *>(FlagsStorage + Diff);
+  FlagsHandled = FlagsIncoming + NumFlagsAllocated;
+  static_assert(sizeof(FlagsIncoming[0]) == sizeof(uint8_t), "Flags are bytes");
+  assert(reinterpret_cast<uint64_t>(FlagsIncoming) % PageSize == 0);
+  assert(reinterpret_cast<uint64_t>(FlagsHandled) % PageSize == 0);
+  assert(NumFlagsAllocated >= MinFlags);
+}
+
+unsigned ThinLtoInstrumentationLayer::reserveDiscoveryFlags(unsigned Count) {
+  // Atomically claims Count consecutive flags and returns the first index.
+  assert(Count > 0);
+  unsigned After, Before = NumFlagsUsed.load();
+  do {
+    After = Before + Count;
+    // The storage is fixed-size: stop instead of writing past its end.
+    if (After < Before || After > NumFlagsAllocated)
+      std::abort();
+  } while (!NumFlagsUsed.compare_exchange_weak(Before, After));
+#ifndef NDEBUG
+  for (unsigned i = Before; i < After; i++)
+    assert(FlagsIncoming[i] == Clear);
+#endif
+  return Before; // First reserved index
+}
+
+void ThinLtoInstrumentationLayer::registerDiscoveryFlagOwners(
+    std::vector<GlobalValue::GUID> Guids, unsigned FirstIdx) {
+  // Maps flag FirstIdx + k to Guids[k]; the owners map needs the lock.
+  std::lock_guard<std::mutex> Lock(DiscoveryFlagsInfoLock);
+  unsigned FlagIdx = FirstIdx;
+  for (GlobalValue::GUID Owner : Guids) {
+    assert(!FlagOwnersMap.count(FlagIdx) &&
+           "Flag should not have an owner at this point");
+    FlagOwnersMap[FlagIdx++] = Owner;
+  }
+}
+
+std::vector<unsigned> ThinLtoInstrumentationLayer::takeFlagsThatFired() {
+  // Acquire pairs with the release stores of the writers; the loaded value
+  // itself is not used.
+  FlagsSync.load(std::memory_order_acquire);
+
+  // Collect fired flags that are not marked handled yet, and mark them.
+  std::vector<unsigned> NewlyFired;
+  for (unsigned Idx = 0, End = NumFlagsUsed.load(); Idx != End; ++Idx) {
+    if (FlagsIncoming[Idx] != Fired || FlagsHandled[Idx] != Clear)
+      continue;
+    FlagsHandled[Idx] = Fired;
+    NewlyFired.push_back(Idx);
+  }
+  return NewlyFired;
+}
+
+std::vector<GlobalValue::GUID>
+ThinLtoInstrumentationLayer::takeFlagOwners(std::vector<unsigned> Indexes) {
+  // Looks up and removes the owner of each given flag, under the map lock.
+  std::vector<GlobalValue::GUID> Owners;
+  std::lock_guard<std::mutex> Lock(DiscoveryFlagsInfoLock);
+
+  for (unsigned FlagIdx : Indexes) {
+    auto Entry = FlagOwnersMap.find(FlagIdx);
+    assert(Entry != FlagOwnersMap.end());
+    Owners.push_back(Entry->second);
+    FlagOwnersMap.erase(Entry);
+  }
+  return Owners;
+}
+
+void ThinLtoInstrumentationLayer::nudgeIntoDiscovery(
+    std::vector<GlobalValue::GUID> Functions) {
+  const unsigned NumNudged = Functions.size();
+
+  // Register synthetic flags in advance. We expect them to get processed
+  // before the respective functions get emitted. Either way, emit() reserves
+  // and registers a separate flag for every function it instruments.
+  const unsigned FirstFlagIdx = reserveDiscoveryFlags(NumNudged);
+  registerDiscoveryFlagOwners(std::move(Functions), FirstFlagIdx);
+
+  // Mark the flags as fired right away. With static-code barriers enabled,
+  // publish them with a release store, so discovery picks them up soon.
+  for (unsigned Offset = 0; Offset != NumNudged; ++Offset)
+    FlagsIncoming[FirstFlagIdx + Offset] = Fired;
+  if (InsertMemBarrier & ExplicitMemoryBarrier::StaticCode)
+    FlagsSync.store(0, std::memory_order_release);
+
+  LLVM_DEBUG(dbgs() << "Nudged " << NumNudged
+                    << " new functions into discovery\n");
+}
+
+void ThinLtoInstrumentationLayer::emit(MaterializationResponsibility R,
+                                       ThreadSafeModule TSM) {
+  // Gives every function defined in the module a prolog block that sets the
+  // function's discovery flag, then passes the module on to the base layer.
+  TSM.withModuleDo([this](Module &M) {
+    std::vector<Function *> FunctionsToInstrument;
+    // We may have discovered ahead of some functions already, but we still
+    // instrument them all. Their notifications steer the future direction of
+    // discovery.
+    for (Function &F : M.getFunctionList())
+      if (!F.isDeclaration())
+        FunctionsToInstrument.push_back(&F);
+
+    if (!FunctionsToInstrument.empty()) {
+      IRBuilder<> B(M.getContext());
+      std::vector<GlobalValue::GUID> NewDiscoveryRoots;
+
+      // Flags that fire must have owners registered. We will do it below and
+      // that's fine, because they can only be reached once the code is emitted.
+      unsigned FirstFlagIdx =
+          reserveDiscoveryFlags(FunctionsToInstrument.size());
+      unsigned NextFlagIdx = FirstFlagIdx;
+      for (Function *F : FunctionsToInstrument) {
+        // TODO: Emitting the write operation into an indirection stub would
+        // allow to skip it once we got the notification.
+        BasicBlock *E = &F->getEntryBlock();
+        B.SetInsertPoint(BasicBlock::Create(
+            M.getContext(), "NotifyFunctionReachedProlog", F, E));
+        compileFunctionReachedFlagSetter(B, FlagsIncoming + NextFlagIdx);
+        B.CreateBr(E);
+        // The i-th instrumented function becomes owner of the i-th new flag.
+        std::string GlobalName = GlobalValue::getGlobalIdentifier(
+            F->getName(), F->getLinkage(), M.getSourceFileName());
+        NewDiscoveryRoots.push_back(GlobalValue::getGUID(GlobalName));
+        ++NextFlagIdx;
+      }
+
+      LLVM_DEBUG(dbgs() << "Instrumented " << NewDiscoveryRoots.size()
+                        << " new functions in module " << M.getName() << "\n");
+
+      // Submit owner info, so the DiscoveryThread can evaluate the flags.
+      registerDiscoveryFlagOwners(std::move(NewDiscoveryRoots), FirstFlagIdx);
+    }
+  });
+
+  BaseLayer.emit(std::move(R), std::move(TSM));
+}
+
+void ThinLtoInstrumentationLayer::compileFunctionReachedFlagSetter(
+    IRBuilder<> &B, Flag *F) {
+  // Emits a store of 'Fired' to flag F at the builder's insert point.
+  assert(*F == Clear);
+  Type *Int64Ty = Type::getInt64Ty(B.getContext());
+  // Write one immediate 8bit value to a fixed location in memory.
+  auto FlagAddr = pointerToJITTargetAddress(F);
+  Type *FlagTy = Type::getInt8Ty(B.getContext());
+  B.CreateStore(ConstantInt::get(FlagTy, Fired),
+                B.CreateIntToPtr(ConstantInt::get(Int64Ty, FlagAddr),
+                                 FlagTy->getPointerTo()));
+
+  if (InsertMemBarrier & ExplicitMemoryBarrier::JITedCode) {
+    // Release store; the discovery thread acquires it. The value is unused.
+    // The alignment is in bytes and must not exceed that of FlagsSync.
+    static constexpr bool IsVolatile = true;
+    static constexpr Instruction *NoInsertBefore = nullptr;
+    auto SyncFlagAddr = pointerToJITTargetAddress(&FlagsSync);
+    B.Insert(
+        new StoreInst(ConstantInt::get(Int64Ty, 0),
+                      B.CreateIntToPtr(ConstantInt::get(Int64Ty, SyncFlagAddr),
+                                       Int64Ty->getPointerTo()),
+                      IsVolatile, MaybeAlign(alignof(decltype(FlagsSync))),
+                      AtomicOrdering::Release, SyncScope::System,
+                      NoInsertBefore));
+  }
+}
+
+ThinLtoInstrumentationLayer::~ThinLtoInstrumentationLayer() {
+  // With debug output enabled, print how many flags were allocated, issued
+  // and fired, and how many flag owners are still registered.
+  LLVM_DEBUG({
+    unsigned FiredCount = 0;
+    for (unsigned Idx = 0; Idx != NumFlagsAllocated; ++Idx)
+      if (FlagsIncoming[Idx] == Fired)
+        ++FiredCount;
+
+    dbgs() << "Discovery flags stats\n";
+    dbgs() << "Alloc:  " << format("%6.d", NumFlagsAllocated) << "\n";
+    dbgs() << "Issued: " << format("%6.d", NumFlagsUsed.load()) << "\n";
+    dbgs() << "Fired:  " << format("%6.d", FiredCount) << "\n";
+
+    // Entries leave the map in takeFlagOwners(), so what remains are owners
+    // whose flags were never taken.
+    const unsigned RemainingFlagOwners = FlagOwnersMap.size();
+    dbgs() << "\nFlagOwnersMap has " << RemainingFlagOwners
+           << " remaining entries.\n";
+  });
+
+  // The storage block came from std::calloc in allocateDiscoveryFlags().
+  std::free(FlagsStorage);
+}
+
+} // namespace orc
+} // namespace llvm
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoJIT.h b/llvm/examples/ThinLtoJIT/ThinLtoJIT.h
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoJIT.h
@@ -0,0 +1,115 @@
+#ifndef LLVM_EXAMPLES_THINLTOJIT_THINLTOJIT_H
+#define LLVM_EXAMPLES_THINLTOJIT_THINLTOJIT_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
+#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
+#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ThreadPool.h"
+
+#include "ThinLtoInstrumentationLayer.h"
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+class ThinLtoModuleIndex;
+class ThinLtoDiscoveryThread;
+
+class RTDyldObjectLinkingLayer;
+class IRCompileLayer;
+class CompileOnDemandLayer;
+
+class JITDylib;
+class MangleAndInterner;
+class LazyCallThroughManager;
+
+class ThinLtoJIT {
+public:
+  using AddModuleFunction = std::function<Error(ThreadSafeModule)>;
+  /// Sets up the JIT for the given modules. Failures are reported via Err.
+  ThinLtoJIT(ArrayRef<std::string> ModuleFiles, StringRef MainFunctionName,
+             unsigned LookaheadLevels, unsigned NumCompileThreads,
+             unsigned NumLoadThreads, unsigned DiscoveryFlagsPerBucket,
+             ThinLtoInstrumentationLayer::ExplicitMemoryBarrier MemFence,
+             bool AllowNudgeIntoDiscovery, Error &Err);
+  ~ThinLtoJIT();
+
+  ThinLtoJIT(const ThinLtoJIT &) = delete;
+  ThinLtoJIT &operator=(const ThinLtoJIT &) = delete;
+  ThinLtoJIT(ThinLtoJIT &&) = delete;
+  ThinLtoJIT &operator=(ThinLtoJIT &&) = delete;
+  /// Looks up the main function in MainJD and runs it with Args.
+  Expected<int> main(ArrayRef<std::string> Args) {
+    auto MainSym = ES.lookup({MainJD}, MainFunctionMangled);
+    if (!MainSym)
+      return MainSym.takeError();
+
+    using MainFn = int(int, char *[]);
+    auto Main = jitTargetAddressToFunction<MainFn *>(MainSym->getAddress());
+
+    return runAsMain(Main, Args, StringRef("ThinLtoJIT"));
+  }
+
+private:
+  ExecutionSession ES;
+  std::unique_ptr<DataLayout> DL;
+
+  // Local convenience class to allow late construction of the mangler while
+  // preserving the conventional Mangle(SymbolName) syntax.
+  struct MangleWrapper {
+    SymbolStringPtr operator()(StringRef S) { return Impl->operator()(S); }
+    char getGlobalPrefix() { return DL->getGlobalPrefix(); }
+    MangleAndInterner &getImpl() { return *Impl.get(); }
+    void init(ExecutionSession &ES, Module *M) {
+      DL = std::make_unique<DataLayout>(M);
+      Impl = std::make_unique<MangleAndInterner>(ES, *DL);
+    }
+    std::unique_ptr<MangleAndInterner> Impl{nullptr};
+    std::unique_ptr<DataLayout> DL{nullptr};
+  };
+
+  MangleWrapper Mangle;
+  // Stays null until the constructor has created the JITDylib.
+  JITDylib *MainJD{nullptr};
+  SymbolStringPtr MainFunctionMangled;
+  std::unique_ptr<ThreadPool> CompileThreads;
+  std::unique_ptr<ThinLtoModuleIndex> GlobalIndex;
+
+  AddModuleFunction AddModule;
+  AddModuleFunction AddModuleAndLookup;
+  std::unique_ptr<RTDyldObjectLinkingLayer> ObjLinkingLayer;
+  std::unique_ptr<IRCompileLayer> CompileLayer;
+  std::unique_ptr<ThinLtoInstrumentationLayer> InstrumentationLayer;
+  std::unique_ptr<CompileOnDemandLayer> OnDemandLayer;
+  // Keep-running flag that is handed to the discovery thread worker.
+  std::atomic<bool> JitRunning{false};
+  std::unique_ptr<ThinLtoDiscoveryThread> DiscoveryThreadWorker;
+  std::unique_ptr<LazyCallThroughManager> CallThroughManager;
+
+  Error
+  setupLayers(Triple TT, unsigned DiscoveryFlagsPerBucket,
+              ThinLtoInstrumentationLayer::ExplicitMemoryBarrier MemFence);
+  Error setupJITDylib(JITDylib *JD, bool AllowNudge);
+  Error setupDiscovery(JITDylib *MainJD, unsigned NumCompileThreads,
+                       unsigned NumLoadThreads, unsigned LookaheadLevels);
+  Expected<ThreadSafeModule> setupMainModule(StringRef MainFunction);
+
+  static void exitOnLazyCallThroughFailure() {
+    errs() << "Compilation failed. Aborting.\n";
+    exit(1);
+  }
+};
+
+} // namespace orc
+} // namespace llvm
+
+#endif
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp b/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp
@@ -0,0 +1,307 @@
+#include "ThinLtoJIT.h"
+
+#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h"
+#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
+#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
+#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
+#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/Support/Debug.h"
+
+#include "ThinLtoDiscoveryThread.h"
+#include "ThinLtoInstrumentationLayer.h"
+#include "ThinLtoModuleIndex.h"
+
+#include <set>
+#include <string>
+#include <thread>
+
+#ifndef NDEBUG
+#include <chrono>
+#endif
+
+#define DEBUG_TYPE "thinltojit"
+
+namespace llvm {
+namespace orc {
+
+class ThinLtoDefinitionGenerator : public JITDylib::DefinitionGenerator {
+public:
+  ThinLtoDefinitionGenerator(ThinLtoModuleIndex &GlobalIndex,
+                             ThinLtoInstrumentationLayer &InstrumentationLayer,
+                             ThinLtoJIT::AddModuleFunction AddModule,
+                             char ManglePrefix, const char *ModReqPrefix,
+                             bool AllowNudge)
+      : GlobalIndex(GlobalIndex), InstrumentationLayer(InstrumentationLayer),
+        AddModule(std::move(AddModule)), ManglePrefix(ManglePrefix),
+        ModuleRequestPrefix(ModReqPrefix),
+        ModuleRequestPrefixLen(std::strlen(ModReqPrefix)),
+        AllowNudgeIntoDiscovery(AllowNudge) {}
+  /// Adds the modules that Symbols request or that define them to the JIT.
+  Error tryToGenerate(LookupKind K, JITDylib &JD,
+                      JITDylibLookupFlags JDLookupFlags,
+                      const SymbolLookupSet &Symbols) override;
+
+private:
+  ThinLtoModuleIndex &GlobalIndex;
+  ThinLtoInstrumentationLayer &InstrumentationLayer;
+  ThinLtoJIT::AddModuleFunction AddModule;
+  char ManglePrefix;
+  const char *ModuleRequestPrefix;
+  unsigned ModuleRequestPrefixLen;
+  bool AllowNudgeIntoDiscovery;
+  // ThinLTO summaries encode unprefixed names.
+  StringRef stripGlobalManglePrefix(StringRef Symbol) const {
+    if (ManglePrefix == '\0' || Symbol.empty() || Symbol[0] != ManglePrefix)
+      return Symbol; // No prefix in use, or nothing to strip.
+    return Symbol.substr(1);
+  }
+};
+
+Error ThinLtoDefinitionGenerator::tryToGenerate(
+    LookupKind K, JITDylib &JD, JITDylibLookupFlags JDLookupFlags,
+    const SymbolLookupSet &Symbols) {
+  // Collects the module paths behind the looked-up symbols (explicit module
+  // requests and symbols found in the global index) and adds those modules.
+  std::set<StringRef> ModulePaths;
+  std::vector<GlobalValue::GUID> NewDiscoveryRoots;
+#ifndef NDEBUG
+  std::set<StringRef> ModuleRequests;
+#endif
+
+  for (const auto &KV : Symbols) {
+    StringRef UnmangledName = stripGlobalManglePrefix(*KV.first);
+
+    if (UnmangledName.startswith(ModuleRequestPrefix)) {
+      // Module request from the DiscoveryThread
+      StringRef Path = UnmangledName.substr(ModuleRequestPrefixLen);
+      ModulePaths.insert(Path);
+#ifndef NDEBUG
+      ModuleRequests.insert(Path);
+#endif
+    } else {
+      // Regular symbol name
+      auto Guid = GlobalValue::getGUID(UnmangledName);
+      if (GlobalValueSummary *S = GlobalIndex.getSummary(Guid)) {
+        // We could have discovered it ahead of time.
+        LLVM_DEBUG(dbgs() << "Failed to discover symbol: " << UnmangledName
+                          << "\n");
+        ModulePaths.insert(S->modulePath());
+        if (AllowNudgeIntoDiscovery && isa<FunctionSummary>(S)) {
+          NewDiscoveryRoots.push_back(Guid);
+        }
+      }
+    }
+  }
+
+  for (StringRef Path : ModulePaths) {
+    // When the DiscoveryThread requests a Module, it immediately schedules it
+    // for parsing. We hope we are done at this point, so the function will
+    // only do the confirmation.
+    if (Error ParseErr = GlobalIndex.scheduleModuleParsing(Path))
+      return ParseErr;
+
+    ThreadSafeModule TSM = GlobalIndex.takeModule(Path);
+    if (!TSM) {
+      assert(ModuleRequests.find(Path) != ModuleRequests.end() &&
+             "No regular symbol requests to existing modules");
+      continue;
+    }
+
+    if (Error LoadErr = AddModule(std::move(TSM)))
+      // Found a module but failed to add it.
+      return LoadErr;
+    LLVM_DEBUG(dbgs() << "Generator: added " << Path << "\n");
+  }
+
+  // Requested functions that we failed to discover ahead of time, are likely
+  // close to the execution front. We can anticipate to run into them as soon
+  // as execution continues and trigger their discovery flags already now. This
+  // behavior is enabled with the 'allow-nudge' option and implemented below.
+  // On the one hand, it may give us a head start in a moment where discovery
+  // was lacking behind. On the other hand, we may bet on the wrong horse and
+  // waste extra time speculating in the wrong direction.
+  if (!NewDiscoveryRoots.empty()) {
+    assert(AllowNudgeIntoDiscovery);
+    InstrumentationLayer.nudgeIntoDiscovery(std::move(NewDiscoveryRoots));
+  }
+
+  return Error::success();
+}
+
+ThinLtoJIT::ThinLtoJIT(
+    ArrayRef<std::string> ModuleFiles, StringRef MainFunctionName,
+    unsigned LookaheadLevels, unsigned NumCompileThreads,
+    unsigned NumLoadThreads, unsigned DiscoveryFlagsPerBucket,
+    ThinLtoInstrumentationLayer::ExplicitMemoryBarrier MemFence,
+    bool AllowNudgeIntoDiscovery, Error &Err) {
+  // Every setup step reports failure through Err and returns early.
+  ErrorAsOutParameter ErrAsOutParam(&Err);
+
+  // Index all modules; one that cannot be added is reported, but not fatal.
+  GlobalIndex = std::make_unique<ThinLtoModuleIndex>(ES, NumLoadThreads);
+  for (StringRef File : ModuleFiles)
+    if (Error AddErr = GlobalIndex->add(File))
+      ES.reportError(std::move(AddErr));
+
+  Expected<ThreadSafeModule> ParsedMain = setupMainModule(MainFunctionName);
+  if (!ParsedMain) {
+    Err = ParsedMain.takeError();
+    return;
+  }
+  ThreadSafeModule MainModule = std::move(*ParsedMain);
+  Module *RawModule = MainModule.getModuleUnlocked();
+
+  // Now that we know the target data layout we can setup the mangler.
+  Mangle.init(ES, RawModule);
+  MainFunctionMangled = Mangle(MainFunctionName);
+
+  Err = setupLayers(Triple(RawModule->getTargetTriple()),
+                    DiscoveryFlagsPerBucket, MemFence);
+  if (Err)
+    return;
+
+  MainJD = &ES.createJITDylib("main");
+  Err = setupJITDylib(MainJD, AllowNudgeIntoDiscovery);
+  if (Err)
+    return;
+
+  Err = setupDiscovery(MainJD, NumCompileThreads, NumLoadThreads,
+                       LookaheadLevels);
+  if (Err)
+    return;
+
+  Err = AddModule(std::move(MainModule));
+  if (Err)
+    return;
+
+  // Optionally seed discovery with the main function itself.
+  if (AllowNudgeIntoDiscovery)
+    InstrumentationLayer->nudgeIntoDiscovery(
+        {GlobalValue::getGUID(MainFunctionName)});
+
+#ifndef NDEBUG
+  // Uncomment to give the discovery thread some time do things.
+  // std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+#endif
+}
+
+Expected<ThreadSafeModule> ThinLtoJIT::setupMainModule(StringRef MainFunction) {
+  // Parse the module that the index names as the definer of MainFunction.
+  if (auto Path = GlobalIndex->getModulePathForSymbol(MainFunction))
+    return GlobalIndex->parseModuleFromFile(*Path);
+
+  // Unknown symbol: the error lists all module paths known to the index.
+  std::string Message;
+  raw_string_ostream OS(Message);
+  OS << "No ValueInfo for symbol '" << MainFunction
+     << "' in provided modules: ";
+  for (StringRef KnownPath : GlobalIndex->getAllModulePaths())
+    OS << KnownPath << " ";
+  OS << "\n";
+  return createStringError(inconvertibleErrorCode(), OS.str());
+}
+
+Error ThinLtoJIT::setupLayers(
+    Triple TT, unsigned DiscoveryFlagsPerBucket,
+    ThinLtoInstrumentationLayer::ExplicitMemoryBarrier MemFence) {
+  // Creates the layers bottom-up; each one wraps the layer created before it.
+  ObjLinkingLayer = std::make_unique<RTDyldObjectLinkingLayer>(
+      ES, []() { return std::make_unique<SectionMemoryManager>(); });
+  CompileLayer = std::make_unique<IRCompileLayer>(
+      ES, *ObjLinkingLayer, ConcurrentIRCompiler(JITTargetMachineBuilder(TT)));
+  InstrumentationLayer = std::make_unique<ThinLtoInstrumentationLayer>(
+      ES, *CompileLayer, MemFence, DiscoveryFlagsPerBucket);
+
+  auto ISMB = createLocalIndirectStubsManagerBuilder(TT);
+  auto LCTM = createLocalLazyCallThroughManager(
+      TT, ES, pointerToJITTargetAddress(exitOnLazyCallThroughFailure));
+  if (!LCTM)
+    return LCTM.takeError();
+  CallThroughManager = std::move(*LCTM);
+  OnDemandLayer = std::make_unique<CompileOnDemandLayer>(
+      ES, *InstrumentationLayer, *CallThroughManager, std::move(ISMB));
+  // Don't break up modules. Insert stubs on module boundaries.
+  OnDemandLayer->setPartitionFunction(CompileOnDemandLayer::compileWholeModule);
+
+  // Modules enter MainJD via the on-demand layer, keyed by their index id.
+  AddModule = [this](ThreadSafeModule TSM) -> Error {
+    assert(MainJD && "Setup MainJD JITDylib before calling");
+    StringRef Path = TSM.getModuleUnlocked()->getName();
+    VModuleKey Id = GlobalIndex->getModuleId(Path);
+    return OnDemandLayer->add(*MainJD, std::move(TSM), Id);
+  };
+
+  return Error::success();
+}
+
+static bool IsTrivialModule(MaterializationUnit *MU) {
+  const StringRef Name = MU->getName(); // Compared against three fixed names.
+  const bool IsReexport = Name == "<Lazy Reexports>" || Name == "<Reexports>";
+  return IsReexport || Name == "<Absolute Symbols>";
+}
+
+Error ThinLtoJIT::setupDiscovery(JITDylib *MainJD, unsigned NumCompileThreads,
+                                 unsigned NumLoadThreads,
+                                 unsigned LookaheadLevels) {
+  // Creates the compile thread pool and starts the discovery thread.
+  // Delegate compilation to the thread pool.
+  CompileThreads = std::make_unique<ThreadPool>(NumCompileThreads);
+  ES.setDispatchMaterialization(
+      [this](JITDylib &JD, std::shared_ptr<MaterializationUnit> MU) {
+        if (IsTrivialModule(MU.get())) {
+          // This should be quick. And we maybe save a few session locks?
+          MU->doMaterialize(JD);
+        } else {
+          CompileThreads->async([MU, &JD]() { MU->doMaterialize(JD); });
+        }
+      });
+#ifndef NDEBUG
+  // Uncomment to avoid discovering all at once when debugging small examples.
+  // LookaheadLevels = 1;
+#endif
+
+  // Spawn discovery thread and let it add newly discovered modules to the JIT.
+  JitRunning.store(true);
+  DiscoveryThreadWorker = std::make_unique<ThinLtoDiscoveryThread>(
+      JitRunning, ES, MainJD, *InstrumentationLayer, *GlobalIndex,
+      LookaheadLevels, NumLoadThreads, Mangle.getImpl());
+  // The thread is detached and only holds a reference to the worker object.
+  std::thread(std::ref(*DiscoveryThreadWorker)).detach();
+  return Error::success();
+}
+
+Error ThinLtoJIT::setupJITDylib(JITDylib *JD, bool AllowNudge) {
+  // Installs the C++ runtime overrides and the definition generators on JD.
+  // Register symbols for C++ static destructors.
+  LocalCXXRuntimeOverrides CXXRuntimeoverrides;
+  if (Error Err = CXXRuntimeoverrides.enable(*JD, Mangle.getImpl()))
+    return Err;
+
+  // Lookup symbol names in the global ThinLTO module index first. The prefix
+  // is a static member; the discovery worker object doesn't exist yet here.
+  char ManglePrefix = Mangle.getGlobalPrefix();
+  const char *ModReqPrefix = ThinLtoDiscoveryThread::getModuleRequestPrefix();
+  JD->addGenerator(std::make_unique<ThinLtoDefinitionGenerator>(
+      *GlobalIndex, *InstrumentationLayer, AddModule, ManglePrefix,
+      ModReqPrefix, AllowNudge));
+  // Then try lookup in the host process.
+  auto HostLookup =
+      DynamicLibrarySearchGenerator::GetForCurrentProcess(ManglePrefix);
+  if (!HostLookup)
+    return HostLookup.takeError();
+  JD->addGenerator(std::move(*HostLookup));
+  return Error::success();
+}
+
+ThinLtoJIT::~ThinLtoJIT() {
+  JitRunning.store(false); // Signal the DiscoveryThread to shut down.
+  // Wait for pending compile actions; no pool exists if setup failed early.
+  if (CompileThreads)
+    CompileThreads->wait();
+}
+
+} // namespace orc
+} // namespace llvm
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h
@@ -0,0 +1,82 @@
+#ifndef LLVM_EXAMPLES_THINLTOJIT_THINLTOJITMODULEINDEX_H
+#define LLVM_EXAMPLES_THINLTOJIT_THINLTOJITMODULEINDEX_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ThreadPool.h"
+
+#include <cstdint>
+#include <future>
+#include <mutex>
+#include <set>
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+class SymbolStringPtr;
+
+/// Combined ThinLTO summary index over all input modules, together with the
+/// bookkeeping for parsing modules on a worker pool and for ranking the module
+/// paths found during call-graph discovery.
+class ThinLtoModuleIndex {
+  static constexpr bool HaveGVs = false;
+
+public:
+  /// \param ES                  session used to report errors of asynchronous
+  ///                            parse jobs.
+  /// \param ParseModuleThreads  forwarded to the module-parsing ThreadPool.
+  ThinLtoModuleIndex(ExecutionSession &ES, unsigned ParseModuleThreads)
+      : ES(ES), CombinedSummaryIndex(HaveGVs),
+        ParseModuleWorkers(ParseModuleThreads) {}
+
+  /// Merge the summary of the module file at \p ModulePath into the index.
+  Error add(StringRef ModulePath);
+  /// Summary for \p Function, or nullptr if the index has none.
+  GlobalValueSummary *getSummary(GlobalValue::GUID Function) const;
+  /// Paths of all added modules, positioned by their module id.
+  std::vector<StringRef> getAllModulePaths() const;
+  /// Path of the module that provides \p Name, or None if it is unknown.
+  Optional<StringRef> getModulePathForSymbol(StringRef Name) const;
+
+  /// Queue \p Path for parsing on the worker pool unless already scheduled.
+  Error scheduleModuleParsing(StringRef Path);
+  /// Hand out the parsed module for \p Path, waiting for its parse job if
+  /// needed. A module is handed out once; later calls get a null module.
+  ThreadSafeModule takeModule(StringRef Path);
+
+  // Blocking module parsing. This is only used for the main module.
+  Expected<ThreadSafeModule> parseModuleFromFile(StringRef Path);
+
+  /// Rank the module paths of the functions reachable from \p S within
+  /// \p LookaheadLevels call levels.
+  void discoverCalleeModulePaths(FunctionSummary *S, unsigned LookaheadLevels);
+  /// Up to \p Count discovered paths; all of them if fewer are known.
+  std::vector<StringRef> getNextPaths(unsigned Count);
+  /// Number of paths currently held in the rank table.
+  unsigned getNumDiscoveredModules() const { return PathRank.size(); }
+
+  /// Module id the combined index assigned to \p Path.
+  VModuleKey getModuleId(StringRef Path) const {
+    return CombinedSummaryIndex.getModuleId(Path);
+  }
+
+private:
+  ExecutionSession &ES;
+  ModuleSummaryIndex CombinedSummaryIndex;
+  // Id handed to the next module summary that is merged successfully.
+  uint64_t NextModuleId{0};
+
+  // Per-path discovery statistics.
+  struct PathRankEntry {
+    uint32_t Count{0};     // How often discovery reached this path.
+    uint32_t MinDist{100}; // Smallest call distance seen so far.
+  };
+  StringMap<PathRankEntry> PathRank;
+
+  // Guards ScheduledModules.
+  std::mutex ScheduledModulesLock;
+  StringMap<std::shared_future<void>> ScheduledModules;
+
+  // Guards ParsedModules.
+  std::mutex ParsedModulesLock;
+  StringMap<ThreadSafeModule> ParsedModules;
+
+  // Keep the pool as the LAST data member. Members are destroyed in reverse
+  // declaration order, so the pool's destructor runs first while the mutexes
+  // and maps above are still alive for any parse job that has not finished.
+  ThreadPool ParseModuleWorkers;
+
+  void updatePathRank(StringRef Path, unsigned Distance);
+  void addToWorklist(std::vector<FunctionSummary *> &List,
+                     ArrayRef<FunctionSummary::EdgeTy> Calls);
+
+  std::vector<StringRef> selectAllPaths();
+  std::vector<StringRef> selectHotPaths(unsigned Count);
+
+  Expected<ThreadSafeModule> doParseModule(StringRef Path);
+};
+
+} // namespace orc
+} // namespace llvm
+
+#endif
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp
@@ -0,0 +1,270 @@
+#include "ThinLtoModuleIndex.h"
+
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <memory>
+#include <string>
+
+#define DEBUG_TYPE "thinltojit"
+
+namespace llvm {
+namespace orc {
+
+/// Read the module summary from the file at \p ModulePath and merge it into
+/// the combined index under the next free module id.
+///
+/// \returns an error if the file cannot be read or its summary cannot be
+/// parsed; the module id counter is left untouched in that case.
+Error ThinLtoModuleIndex::add(StringRef ModulePath) {
+  auto FileBuf = errorOrToExpected(MemoryBuffer::getFile(ModulePath));
+  if (!FileBuf)
+    return FileBuf.takeError();
+
+  auto BufRef = (*FileBuf)->getMemBufferRef();
+  if (Error ReadErr =
+          readModuleSummaryIndex(BufRef, CombinedSummaryIndex, NextModuleId))
+    return ReadErr;
+
+  // The id is consumed only after the summary was merged successfully.
+  ++NextModuleId;
+  return Error::success();
+}
+
+/// Collect the paths of all modules in the combined index.
+///
+/// \returns a vector with one slot per module id; slot i holds the path of the
+/// module with id i. The StringRefs point at the keys of the index's module
+/// path table.
+std::vector<StringRef> ThinLtoModuleIndex::getAllModulePaths() const {
+  // Bind by reference. The StringRefs returned below point at the keys of
+  // this table, so it has to be the index's own table and not a local copy
+  // that is destroyed when this function returns.
+  const auto &ModuleTable = CombinedSummaryIndex.modulePaths();
+  assert(ModuleTable.size() == NextModuleId);
+
+  std::vector<StringRef> Paths;
+  Paths.resize(NextModuleId);
+
+  for (const auto &KV : ModuleTable) {
+    // KV.second.first is the module id, which serves as the slot index.
+    assert(Paths[KV.second.first].empty() && "IDs are unique and continuous");
+    Paths[KV.second.first] = KV.first();
+  }
+
+  return Paths;
+}
+
+/// Look up the summary recorded for \p Function in the combined index.
+///
+/// \returns the base object of the first entry in the summary list, or
+/// nullptr when the GUID is unknown or has no summaries.
+GlobalValueSummary *
+ThinLtoModuleIndex::getSummary(GlobalValue::GUID Function) const {
+  ValueInfo Info = CombinedSummaryIndex.getValueInfo(Function);
+  if (!Info)
+    return nullptr;
+
+  const auto &Summaries = Info.getSummaryList();
+  if (Summaries.empty())
+    return nullptr;
+
+#ifndef NDEBUG
+  // Only the first entry is used; make it visible when there are more.
+  if (Summaries.size() > 1) {
+    LLVM_DEBUG(dbgs() << "SummaryList with multiple entries!\n");
+  }
+#endif
+  return Summaries.front().get()->getBaseObject();
+}
+
+/// Map a symbol name to the path of the module whose summary provides it.
+///
+/// \returns None when the index has no summary for the GUID of \p Name.
+Optional<StringRef>
+ThinLtoModuleIndex::getModulePathForSymbol(StringRef Name) const {
+  GlobalValueSummary *Summary = getSummary(GlobalValue::getGUID(Name));
+  if (!Summary)
+    return None; // We don't know the symbol.
+  return Summary->modulePath();
+}
+
+/// Queue the module at \p Path for parsing on the worker pool, unless a job
+/// for the same path was recorded before. The job stores the parsed module in
+/// ParsedModules, or reports the parse error through the execution session.
+///
+/// \returns success in all cases.
+Error ThinLtoModuleIndex::scheduleModuleParsing(StringRef Path) {
+  std::lock_guard<std::mutex> ScheduleGuard(ScheduledModulesLock);
+
+  // Already scheduled (or parsed): takeModule() can be called right away.
+  if (ScheduledModules.count(Path) != 0)
+    return Error::success();
+
+  LLVM_DEBUG(dbgs() << "Schedule module for parsing: " << Path << "\n");
+
+  // The job owns a copy of the path; the StringRef may not outlive this call.
+  auto ParseJob = [this, OwnedPath = Path.str()]() {
+    auto TSM = doParseModule(OwnedPath);
+    if (!TSM) {
+      ES.reportError(TSM.takeError());
+      return;
+    }
+
+    std::lock_guard<std::mutex> ParsedGuard(ParsedModulesLock);
+    ParsedModules[OwnedPath] = std::move(*TSM);
+
+    LLVM_DEBUG(dbgs() << "Finished parsing module: " << OwnedPath << "\n");
+  };
+
+  // Remember the future so that takeModule() can wait for the job.
+  ScheduledModules[Path] = ParseModuleWorkers.async(std::move(ParseJob));
+  return Error::success();
+}
+
+/// Hand out the parsed module for \p Path, blocking until its parse job has
+/// finished if it is not available yet.
+///
+/// \p Path must have been scheduled before (asserted).
+///
+/// \returns the module on the first call for a path. A null ThreadSafeModule
+/// is returned when the module was already taken, or when its parse job
+/// failed (the job reports that error itself and stores no module).
+ThreadSafeModule ThinLtoModuleIndex::takeModule(StringRef Path) {
+  std::unique_lock<std::mutex> ParseLock(ParsedModulesLock);
+
+  auto ParsedIt = ParsedModules.find(Path);
+  if (ParsedIt == ParsedModules.end()) {
+    // Do not hold the lock while waiting: the parse job needs it to publish
+    // its result.
+    ParseLock.unlock();
+
+    // The module is not ready, wait for the future we stored.
+    std::unique_lock<std::mutex> ScheduleLock(ScheduledModulesLock);
+    auto ScheduledIt = ScheduledModules.find(Path);
+    assert(ScheduledIt != ScheduledModules.end() &&
+           "Don't call for unscheduled modules");
+    std::shared_future<void> Future = ScheduledIt->getValue();
+    ScheduleLock.unlock();
+    Future.get();
+
+    ParseLock.lock();
+    ParsedIt = ParsedModules.find(Path);
+    // A finished job without a stored module means parsing failed. Return a
+    // null module instead of dereferencing the end iterator.
+    if (ParsedIt == ParsedModules.end())
+      return ThreadSafeModule();
+  }
+
+  // We only add each module once. If it's not here anymore, we can skip it.
+  ThreadSafeModule TSM = std::move(ParsedIt->getValue());
+  ParsedIt->getValue() = ThreadSafeModule();
+  return TSM;
+}
+
+/// Parse the module at \p Path on the calling thread and record the path as
+/// both scheduled and parsed. Both tables must still be empty (asserted), so
+/// this is meant for the first module only.
+///
+/// \returns the parsed module, or the parse error.
+Expected<ThreadSafeModule>
+ThinLtoModuleIndex::parseModuleFromFile(StringRef Path) {
+  auto Parsed = doParseModule(Path);
+  if (!Parsed)
+    return Parsed.takeError();
+
+  // An already fulfilled future marks the path as scheduled.
+  std::promise<void> Fulfilled;
+  Fulfilled.set_value();
+  std::shared_future<void> Ready = Fulfilled.get_future().share();
+
+  std::lock_guard<std::mutex> ScheduleGuard(ScheduledModulesLock);
+  assert(ScheduledModules.empty() && "Only do this for the first module");
+  ScheduledModules[Path] = std::move(Ready);
+
+  // A null module marks the path as parsed; the real module goes straight to
+  // the caller instead of through takeModule().
+  std::lock_guard<std::mutex> ParseGuard(ParsedModulesLock);
+  assert(ParsedModules.empty() && "Only do this for the first module");
+  ParsedModules[Path] = ThreadSafeModule();
+
+  return std::move(*Parsed);
+}
+
+/// Parse the IR file at \p Path into a module that owns a fresh LLVMContext.
+///
+/// \returns the module bundled with its context, or a string error that
+/// carries the file name and the parser diagnostic.
+Expected<ThreadSafeModule> ThinLtoModuleIndex::doParseModule(StringRef Path) {
+  // TODO: make a SMDiagnosticError class for this
+  SMDiagnostic Err;
+  auto Ctx = std::make_unique<LLVMContext>();
+  auto M = parseIRFile(Path, Err, *Ctx);
+  if (!M) {
+    std::string ErrDescription;
+    {
+      // The inner scope destroys the stream before the string is read.
+      raw_string_ostream S(ErrDescription);
+      Err.print("ThinLtoJIT", S);
+    }
+    // A StringRef is not guaranteed to be NUL-terminated, so materialize the
+    // path as std::string before handing it to the %s conversion.
+    return createStringError(inconvertibleErrorCode(),
+                             "Failed to load module from file '%s' (%s)",
+                             Path.str().c_str(), ErrDescription.c_str());
+  }
+
+  return ThreadSafeModule(std::move(M), std::move(Ctx));
+}
+
+// Walk the call graph from S level by level and rank the module path of every
+// function reached. Visited functions are deliberately not filtered:
+// discovery is often retriggered from the middle of already visited functions
+// and aims to reach a little further each time.
+void ThinLtoModuleIndex::discoverCalleeModulePaths(FunctionSummary *S,
+                                                   unsigned LookaheadLevels) {
+  // Start with the direct callees of S.
+  std::vector<FunctionSummary *> Current;
+  addToWorklist(Current, S->calls());
+
+  // Every level except the last one is ranked and expanded to its callees.
+  unsigned Distance = 1;
+  for (; Distance < LookaheadLevels; ++Distance) {
+    std::vector<FunctionSummary *> Next;
+    for (FunctionSummary *Callee : Current) {
+      updatePathRank(Callee->modulePath(), Distance);
+      addToWorklist(Next, Callee->calls());
+    }
+    Current = std::move(Next);
+  }
+
+  // The last level is ranked only, without collecting further callees.
+  for (FunctionSummary *Callee : Current)
+    updatePathRank(Callee->modulePath(), Distance);
+
+  // Drop known paths. This includes both, scheduled and parsed modules.
+  std::lock_guard<std::mutex> Guard(ScheduledModulesLock);
+  for (const auto &Scheduled : ScheduledModules)
+    PathRank.erase(Scheduled.first());
+}
+
+/// Append the function summary of every call edge in \p Calls to \p List.
+/// Edges whose callee has an empty summary list are skipped; otherwise the
+/// base object of the first summary is used.
+void ThinLtoModuleIndex::addToWorklist(
+    std::vector<FunctionSummary *> &List,
+    ArrayRef<FunctionSummary::EdgeTy> Calls) {
+  for (const auto &Edge : Calls) {
+    const auto &CalleeSummaries = Edge.first.getSummaryList();
+    if (CalleeSummaries.empty())
+      continue;
+
+    GlobalValueSummary *Base = CalleeSummaries.front().get()->getBaseObject();
+    assert(isa<FunctionSummary>(Base) && "Callees must be functions");
+    List.push_back(cast<FunctionSummary>(Base));
+  }
+}
+
+// Record that Path was reached at the given call distance: bump its hit count
+// and keep the smallest distance seen. PathRank accumulates across discovery
+// runs; entries are only removed once their path has been scheduled.
+void ThinLtoModuleIndex::updatePathRank(StringRef Path, unsigned Distance) {
+  auto &Entry = PathRank[Path];
+  ++Entry.Count;
+  Entry.MinDist = std::min(Entry.MinDist, Distance);
+  assert(Entry.MinDist > 0 && "We want it as a divisor");
+}
+
+// PathRank is only modified from the DiscoveryThread, so it's safe to return
+// its keys as StringRef. Returns the Count hottest paths when more than Count
+// are known, all known paths otherwise.
+std::vector<StringRef> ThinLtoModuleIndex::getNextPaths(unsigned Count) {
+  LLVM_DEBUG(dbgs() << "ModuleIndex: take " << std::min(Count, PathRank.size())
+                    << " out of " << PathRank.size() << " discovered paths\n");
+
+  if (Count >= PathRank.size())
+    return selectAllPaths();
+  return selectHotPaths(Count);
+}
+
+/// Return the key of every PathRank entry, in the table's iteration order.
+std::vector<StringRef> ThinLtoModuleIndex::selectAllPaths() {
+  std::vector<StringRef> AllPaths;
+  AllPaths.reserve(PathRank.size());
+  for (auto It = PathRank.begin(), End = PathRank.end(); It != End; ++It)
+    AllPaths.push_back(It->first());
+  return AllPaths;
+}
+
+/// Pick the \p Count paths with the highest score (hit count divided by
+/// minimum distance) and return them ordered by ascending minimum distance.
+///
+/// \p Count must not exceed the number of PathRank entries; getNextPaths()
+/// only calls this when more than \p Count paths are known.
+std::vector<StringRef> ThinLtoModuleIndex::selectHotPaths(unsigned Count) {
+  struct RankedPath {
+    float Score;
+    unsigned MinDist;
+    StringRef Path;
+  };
+
+  std::vector<RankedPath> Ranked;
+  Ranked.reserve(PathRank.size());
+  for (const auto &Entry : PathRank) {
+    const PathRankEntry &Rank = Entry.second;
+    float Score = static_cast<float>(Rank.Count) / Rank.MinDist;
+    Ranked.push_back({Score, Rank.MinDist, Entry.first()});
+  }
+
+  // Step 1: move the Count best-scoring candidates to the front.
+  const auto TopEnd = Ranked.begin() + Count;
+  std::partial_sort(Ranked.begin(), TopEnd, Ranked.end(),
+                    [](const RankedPath &LHS, const RankedPath &RHS) {
+                      return LHS.Score > RHS.Score;
+                    });
+
+  // Step 2: order just those candidates by how close they are.
+  std::sort(Ranked.begin(), TopEnd,
+            [](const RankedPath &LHS, const RankedPath &RHS) {
+              return LHS.MinDist < RHS.MinDist;
+            });
+
+  std::vector<StringRef> Paths;
+  Paths.reserve(Count);
+  for (auto It = Ranked.begin(); It != TopEnd; ++It)
+    Paths.push_back(It->Path);
+
+  return Paths;
+}
+
+} // namespace orc
+} // namespace llvm
diff --git a/llvm/examples/ThinLtoJIT/bench b/llvm/examples/ThinLtoJIT/bench
new file mode 100755
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/bench
@@ -0,0 +1,89 @@
+#!/bin/bash
+#set -x
+
+# Three positional arguments are mandatory; a fourth one overrides the sysroot.
+if [ $# -lt 3 ]; then
+  echo "Usage: bench <path to llvm binaries> <path to c-sources> <main source file> [<override sysroot>]"
+  exit 1
+fi
+
+TOOLS_DIR="$1"
+SOURCE_DIR="$2"
+MAIN_SOURCE_FILE="$3"
+
+# Use "/" as sysroot when no fourth argument was passed.
+SYS_ROOT="${4-/}"
+
+function check_tool ()
+{
+  if [ -e "${TOOLS_DIR}/$1" ]; then
+    echo "Found: $1"
+  else
+    echo "!!! Cannot find required tool, please provide it in the LLVM binaries folder: $1"
+  fi
+}
+
+# Tools needed for the benchmark runs.
+check_tool lli
+check_tool SpeculativeJIT
+check_tool ThinLtoJIT
+
+# Bitcode is generated only when none of the output directories exists yet in
+# the current working directory.
+SKIP_BITCODE_GEN=0
+if [[ -e bc-default || -e bc-thinlto || -e ll-default || -e ll-thinlto ]]; then
+  echo "Skipping bitcode generation: output directories existing"
+  echo "Please clean up manually: rm -R bc-default bc-thinlto ll-default ll-thinlto"
+  SKIP_BITCODE_GEN=1
+else
+  # Tools needed only for generating bitcode.
+  check_tool clang
+  check_tool llvm-dis
+  mkdir bc-default
+  mkdir bc-thinlto
+  mkdir ll-default
+  mkdir ll-thinlto
+fi
+
+ROOT_DIR=$(pwd)
+ALL_BITCODE_FILES=""
+
+# Base name of the main source file with its .c* extension stripped.
+MAIN_FILE_BASENAME=$(basename "${MAIN_SOURCE_FILE%.c*}")
+LLI_EXTRA_MODULES=""
+
+# Visit every source file matching *.c* in the source directory.
+for f in ${SOURCE_DIR}/*.c* ; do
+  BASE_NAME=$(basename "${f%.c*}")
+
+  if [ ${SKIP_BITCODE_GEN} -eq 0 ]; then
+    echo "Compile: $f -> ${BASE_NAME}.bc"
+
+    # One regular bitcode file and one ThinLTO bitcode file per source.
+    "${TOOLS_DIR}/clang" -c -I ${SOURCE_DIR} ${CFLAGS} -isysroot ${SYS_ROOT} -emit-llvm \
+                         -o "bc-default/${BASE_NAME}.bc" "$f"
+    "${TOOLS_DIR}/clang" -c -I ${SOURCE_DIR} ${CFLAGS} -isysroot ${SYS_ROOT} -flto=thin \
+                         -o "bc-thinlto/${BASE_NAME}.bc" "$f"
+
+    echo "Disassemble ${BASE_NAME}.bc -> ${BASE_NAME}.ll"
+    ${TOOLS_DIR}/llvm-dis bc-default/${BASE_NAME}.bc -o ll-default/${BASE_NAME}.ll
+    ${TOOLS_DIR}/llvm-dis bc-thinlto/${BASE_NAME}.bc -o ll-thinlto/${BASE_NAME}.ll
+  fi
+
+  # The file lists are built even when generation is skipped. The main module
+  # is passed to lli directly, so it is left out of the -extra-module list.
+  ALL_BITCODE_FILES="${ALL_BITCODE_FILES} ${BASE_NAME}.bc"
+  if [ "${BASE_NAME}" != "${MAIN_FILE_BASENAME}" ]; then
+    LLI_EXTRA_MODULES="${LLI_EXTRA_MODULES} -extra-module=${BASE_NAME}.bc"
+  fi
+done
+
+# Echo every timed command so the timings can be matched to their setup.
+set -x
+
+# Runs on the regular bitcode: static build, lli in several JIT
+# configurations, and SpeculativeJIT. Program output is discarded.
+cd ${ROOT_DIR}/bc-default
+time (${TOOLS_DIR}/clang -o ${MAIN_FILE_BASENAME} -O0 ${ALL_BITCODE_FILES} && ./${MAIN_FILE_BASENAME} ${EXEC_ARGS} 1>/dev/null)
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=mcjit "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-mcjit "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -compile-threads=8 "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy -compile-threads=8 "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy -compile-threads=8 -O1 "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy -compile-threads=8 -O0 "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null
+time ${TOOLS_DIR}/SpeculativeJIT -num-threads=8 ${ALL_BITCODE_FILES} --args ${EXEC_ARGS} 1>/dev/null
+
+# Run on the ThinLTO bitcode.
+cd ${ROOT_DIR}/bc-thinlto
+#time (${TOOLS_DIR}/clang -flto=thin -o test ${ALL_BITCODE_FILES} && ./test ${EXEC_ARGS} 1>/dev/null)
+# --args has to come before the program arguments, as in the SpeculativeJIT
+# run above; otherwise they are parsed as additional bitcode inputs.
+time ${TOOLS_DIR}/ThinLtoJIT ${ALL_BITCODE_FILES} --args ${EXEC_ARGS} 1>/dev/null
diff --git a/llvm/examples/ThinLtoJIT/main.cpp b/llvm/examples/ThinLtoJIT/main.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/main.cpp
@@ -0,0 +1,80 @@
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/TargetSelect.h"
+
+#include "ThinLtoInstrumentationLayer.h"
+#include "ThinLtoJIT.h"
+
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+// Positional list of input bitcode files; at least one is required.
+static cl::list<std::string> InputFiles(cl::Positional, cl::OneOrMore,
+                                        cl::desc("<bitcode files>"));
+
+// Arguments for the JITed program: everything following "args" on the command
+// line is collected here (cl::PositionalEatsArgs).
+static cl::list<std::string> InputArgs("args", cl::Positional,
+                                       cl::desc("<program arguments>..."),
+                                       cl::ZeroOrMore, cl::PositionalEatsArgs);
+
+// Thread count for compilation; defaults to 4.
+static cl::opt<unsigned> CompileThreads("compile-threads", cl::Optional,
+                                        cl::desc("Number of compile threads"),
+                                        cl::init(4));
+
+// Thread count for module loading; defaults to 8.
+static cl::opt<unsigned> LoadThreads("load-threads", cl::Optional,
+                                     cl::desc("Number of module load threads"),
+                                     cl::init(8));
+
+// Number of call levels discovery looks ahead; defaults to 4.
+static cl::opt<unsigned>
+    LookaheadLevels("lookahead", cl::Optional,
+                    cl::desc("Calls to look ahead of execution"), cl::init(4));
+
+// Size of a discovery flag bucket; defaults to 4096 flags.
+static cl::opt<unsigned> DiscoveryFlagsBucketSize(
+    "discovery-flag-bucket-size", cl::Optional,
+    cl::desc("Flags per bucket (rounds up to memory pages)"), cl::init(4096));
+
+// Selects where explicit memory fences are used; defaults to Always.
+static cl::opt<orc::ThinLtoInstrumentationLayer::ExplicitMemoryBarrier>
+    MemFence("mem-fence",
+             cl::desc("Control memory fences for cache synchronization"),
+             cl::init(orc::ThinLtoInstrumentationLayer::Always),
+             cl::values(clEnumValN(orc::ThinLtoInstrumentationLayer::Never,
+                                   "never", "No use of memory fences"),
+                        clEnumValN(orc::ThinLtoInstrumentationLayer::StaticCode,
+                                   "static",
+                                   "Use of memory fences in static code only"),
+                        clEnumValN(orc::ThinLtoInstrumentationLayer::JITedCode,
+                                   "jited",
+                                   "Install memory fences in JITed code only"),
+                        clEnumValN(orc::ThinLtoInstrumentationLayer::Always,
+                                   "always", "Always use of memory fences")));
+
+// Off by default.
+static cl::opt<bool>
+    AllowNudge("allow-nudge",
+               cl::desc("Allow the symbol generator to nudge symbols into "
+                        "discovery even though they haven't been reached"),
+               cl::init(false));
+
+/// Entry point: initialize the native target, parse the command line, build
+/// the JIT from the input files and run the JITed "main" with the collected
+/// program arguments.
+int main(int argc, char *argv[]) {
+  InitLLVM X(argc, argv);
+  InitializeNativeTarget();
+  InitializeNativeTargetAsmPrinter();
+  cl::ParseCommandLineOptions(argc, argv, "ThinLtoJIT");
+
+  // Lookahead depth and both thread counts are raised to one if given as zero.
+  auto ClampToOne = [](unsigned Value) { return Value < 1u ? 1u : Value; };
+  const unsigned Lookahead = ClampToOne(LookaheadLevels);
+  const unsigned NumCompileThreads = ClampToOne(CompileThreads);
+  const unsigned NumLoadThreads = ClampToOne(LoadThreads);
+
+  // The constructor reports failure through this out-parameter.
+  Error SetupErr = Error::success();
+  orc::ThinLtoJIT Jit(InputFiles, "main", Lookahead, NumCompileThreads,
+                      NumLoadThreads, DiscoveryFlagsBucketSize, MemFence,
+                      AllowNudge, SetupErr);
+  if (SetupErr) {
+    logAllUnhandledErrors(std::move(SetupErr), errs(), "ThinLtoJIT: ");
+    exit(1);
+  }
+
+  ExitOnError ExitOnErr;
+  ExitOnErr.setBanner("ThinLtoJIT: ");
+
+  return ExitOnErr(Jit.main(InputArgs));
+}