diff --git a/llvm/examples/CMakeLists.txt b/llvm/examples/CMakeLists.txt
--- a/llvm/examples/CMakeLists.txt
+++ b/llvm/examples/CMakeLists.txt
@@ -8,6 +8,7 @@
 add_subdirectory(ModuleMaker)
 add_subdirectory(SpeculativeJIT)
 add_subdirectory(Bye)
+add_subdirectory(ThinLtoJIT)
 
 if(LLVM_ENABLE_EH AND (NOT WIN32) AND (NOT "${LLVM_NATIVE_ARCH}" STREQUAL "ARM"))
     add_subdirectory(ExceptionDemo)
diff --git a/llvm/examples/ThinLtoJIT/CMakeLists.txt b/llvm/examples/ThinLtoJIT/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/CMakeLists.txt
@@ -0,0 +1,21 @@
+set(LLVM_LINK_COMPONENTS
+  Core
+  IRReader
+  OrcJIT
+  ExecutionEngine
+  Support
+  nativecodegen
+  Analysis
+  Passes
+  )
+
+# Disable the VLA warning for argv passed to the JITed main function. Leading space: APPEND_STRING adds no separator.
+set_property(SOURCE main.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -Wno-vla-extension")
+
+add_llvm_example(ThinLtoJIT
+  main.cpp
+  ThinLtoJIT.cpp
+  ThinLtoModuleIndex.cpp
+  ThinLtoInstrumentationLayer.cpp
+  ThinLtoDiscoveryThread.cpp
+  )
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h
@@ -0,0 +1,45 @@
+#ifndef LLVM_EXAMPLES_THINLTOJIT_DISCOVERYTHREAD_H
+#define LLVM_EXAMPLES_THINLTOJIT_DISCOVERYTHREAD_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
+
+#include "ThinLtoJIT.h"
+
+#include <atomic>
+#include <set>
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+class ThinLtoModuleIndex;
+class ThinLtoInstrumentationLayer;
+
+class ThinLtoDiscoveryThread {
+public:
+  ThinLtoDiscoveryThread(std::atomic<bool> &RunningFlag,
+                         ThinLtoInstrumentationLayer &L,
+                         ThinLtoModuleIndex &GlobalIndex,
+                         unsigned LookaheadLevels,
+                         ThinLtoJIT::AddModuleFunction AddModule)
+      : KeepRunning(RunningFlag), Layer(L), GlobalIndex(GlobalIndex),
+        AddModule(std::move(AddModule)), LookaheadLevels(LookaheadLevels) {}
+  // Thread entry point; the loop it runs is defined in the .cpp file.
+  void operator()();
+
+private:
+  std::atomic<bool> &KeepRunning; // Referenced, not owned.
+  ThinLtoInstrumentationLayer &Layer;
+  ThinLtoModuleIndex &GlobalIndex;
+  ThinLtoJIT::AddModuleFunction AddModule;
+  unsigned LookaheadLevels;
+  // Returns the module paths of S's callees, descending while Levels > 0.
+  std::set<StringRef> discoverCalleeModulePaths(FunctionSummary *S,
+                                                unsigned Levels);
+};
+
+} // namespace orc
+} // namespace llvm
+
+#endif
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp
@@ -0,0 +1,104 @@
+#include "ThinLtoDiscoveryThread.h"
+
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+
+#include "ThinLtoInstrumentationLayer.h"
+#include "ThinLtoModuleIndex.h"
+
+#define DEBUG_TYPE "thinltojit"
+
+namespace llvm {
+namespace orc {
+
+// Thread body: polls the layer for fired flags until KeepRunning is cleared.
+void ThinLtoDiscoveryThread::operator()() {
+  while (KeepRunning.load()) {
+    std::vector<unsigned> Indexes = Layer.takeFlagsThatFired();
+
+    // TODO: Dispatch actual module loading into a thread pool.
+    if (!Indexes.empty()) {
+      LLVM_DEBUG(dbgs() << Indexes.size() << " new flags raised\n");
+      auto ReachedFunctions = Layer.takeFlagOwners(std::move(Indexes));
+
+      // While traversing the call graph, collect the modules we cross.
+      std::set<StringRef> Paths;
+      for (GlobalValue::GUID F : ReachedFunctions) {
+        GlobalValueSummary *S = GlobalIndex.getSummary(F);
+        // Zero lookahead means no traversal. Without this guard the
+        // 'LookaheadLevels - 1' below would wrap around to UINT_MAX levels.
+        if (!S || LookaheadLevels == 0)
+          continue;
+        if (auto *FS = dyn_cast<FunctionSummary>(S)) {
+          std::set<StringRef> NewPaths =
+              discoverCalleeModulePaths(FS, LookaheadLevels - 1);
+          Paths.insert(NewPaths.begin(), NewPaths.end());
+        } else {
+          LLVM_DEBUG(dbgs()
+                     << "Reached symbol is not a function: " << F << "\n");
+        }
+      }
+
+#ifndef NDEBUG
+      unsigned Added = 0;
+#endif
+      for (StringRef M : Paths) {
+        Expected<Optional<ThreadSafeModule>> TSM =
+            GlobalIndex.parseNewModuleFromFile(M);
+        if (!TSM) {
+          // Failed to parse the module.
+          Layer.getExecutionSession().reportError(TSM.takeError());
+          continue;
+        }
+        if (!*TSM) // This module was added already.
+          continue;
+        if (Error LoadErr = AddModule(std::move(**TSM))) {
+          // Failed to load the module.
+          Layer.getExecutionSession().reportError(std::move(LoadErr));
+        }
+#ifndef NDEBUG
+        ++Added;
+#endif
+      }
+
+      LLVM_DEBUG(dbgs() << "DiscoveryThread: " << Added << " new modules "
+                        << "(" << Paths.size() - Added << " known modules)\n");
+    }
+  }
+}
+
+// Visited functions are deliberately not filtered: discovery is frequently
+// restarted from somewhere inside already explored code and is supposed to
+// get a little further on every run.
+std::set<StringRef>
+ThinLtoDiscoveryThread::discoverCalleeModulePaths(FunctionSummary *S,
+                                                  unsigned Levels) {
+  // TODO: The paths we discover should be weighed, e.g. by number of edges that
+  // lead there and the minimal distance to go?
+  std::set<StringRef> Result;
+
+  for (const auto &CallEdge : S->calls()) {
+    const auto &Candidates = CallEdge.first.getSummaryList();
+    if (Candidates.empty())
+      continue;
+
+    // Summaries for function callees must be FunctionSummaries.
+    FunctionSummary *Callee =
+        cast<FunctionSummary>(Candidates.front().get()->getBaseObject());
+    if (!Callee)
+      continue;
+
+    Result.insert(Callee->modulePath());
+    if (Levels == 0)
+      continue;
+
+    // Descend one level and merge whatever the callee's subtree contributes.
+    std::set<StringRef> Nested = discoverCalleeModulePaths(Callee, Levels - 1);
+    Result.insert(Nested.begin(), Nested.end());
+  }
+  return Result;
+}
+
+} // namespace orc
+} // namespace llvm
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h
@@ -0,0 +1,78 @@
+#ifndef LLVM_EXAMPLES_THINLTOJIT_DISCOVERYLAYER_H
+#define LLVM_EXAMPLES_THINLTOJIT_DISCOVERYLAYER_H
+
+#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
+#include "llvm/ExecutionEngine/Orc/Layer.h"
+#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+
+#include <atomic>
+#include <cstdint>
+#include <map>
+#include <mutex>
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+/// IR layer that instruments every function definition passing through emit()
+/// with a store to a per-function "reached" flag, then forwards to BaseLayer.
+class ThinLtoInstrumentationLayer : public IRLayer {
+public:
+  /// Bitmask selecting who follows flag writes with a FlagsSync release-store.
+  enum ExplicitMemoryBarrier {
+    Never = 0,
+    StaticCode = 1, // nudgeIntoDiscovery()
+    JITedCode = 2,  // instrumented function prologs
+    Always = 3
+  };
+
+  ThinLtoInstrumentationLayer(ExecutionSession &ES, IRCompileLayer &BaseLayer,
+                              ExplicitMemoryBarrier InsertMemBarrier,
+                              unsigned FlagsPerBatch)
+      : IRLayer(ES), BaseLayer(BaseLayer), InsertMemBarrier(InsertMemBarrier) {
+    // TODO: So far we only allocate one batch.
+    allocateDiscoveryFlags(FlagsPerBatch);
+  }
+
+  ~ThinLtoInstrumentationLayer() override;
+  void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override;
+
+  unsigned reserveDiscoveryFlags(unsigned Count);
+  void registerDiscoveryFlagOwners(std::vector<GlobalValue::GUID> Guids,
+                                   unsigned FirstIdx);
+  void nudgeIntoDiscovery(std::vector<GlobalValue::GUID> Functions);
+  std::vector<unsigned> takeFlagsThatFired();
+  std::vector<GlobalValue::GUID> takeFlagOwners(std::vector<unsigned> Indexes);
+
+private:
+  IRCompileLayer &BaseLayer;
+  ExplicitMemoryBarrier InsertMemBarrier;
+
+  enum Flag : uint8_t { Clear = 0, Fired = 1 };
+
+  // Lock-free read access. All four are set up by allocateDiscoveryFlags().
+  uint8_t *FlagsStorage = nullptr;
+  Flag *FlagsIncoming = nullptr; // lock-free write by design
+  Flag *FlagsHandled = nullptr;
+  unsigned NumFlagsAllocated = 0;
+  std::atomic<unsigned> NumFlagsUsed{0}; // bumped by a CAS loop, not a lock
+
+  // Acquire/release sync between writers and reader. Starts with a defined
+  std::atomic<uint64_t> FlagsSync{0}; // value: it may be loaded before stored.
+
+  // STL container requires locking for both, read and write access.
+  mutable std::mutex DiscoveryFlagsInfoLock;
+  std::map<unsigned, GlobalValue::GUID> FlagOwnersMap;
+
+  void allocateDiscoveryFlags(unsigned MinFlags);
+  void compileFunctionReachedFlagSetter(IRBuilder<> &B, Flag *F);
+};
+
+} // namespace orc
+} // namespace llvm
+
+#endif
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp
@@ -0,0 +1,227 @@
+#include "ThinLtoInstrumentationLayer.h"
+
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Process.h"
+
+#include <cstdlib>
+
+#define DEBUG_TYPE "thinltojit"
+
+namespace llvm {
+namespace orc {
+
+// TODO: Fixed set of flags may not always be enough. Make this expandable.
+void ThinLtoInstrumentationLayer::allocateDiscoveryFlags(unsigned MinFlags) {
+  // Round up to full memory pages.
+  unsigned PageSize = sys::Process::getPageSizeEstimate();
+  unsigned NumPagesEach = (MinFlags + (PageSize - 1)) / PageSize;
+  unsigned NumPagesTotal = 2 * NumPagesEach;
+  assert(isPowerOf2_64(PageSize) && "Adjust aligned memory alloc below");
+  // Allocate one more page to make up for size loss due to alignment.
+  void *Storage = std::calloc(NumPagesTotal + 1, PageSize);
+  if (!Storage) // The pointer math below must not run on a failed allocation.
+    report_fatal_error("ThinLtoJIT: failed to allocate discovery flags");
+  uint64_t StorageAddr = reinterpret_cast<uint64_t>(Storage);
+  uint64_t PageSizeDecr = PageSize - 1;
+  uint64_t AlignedAddr = ((StorageAddr + PageSizeDecr) & ~PageSizeDecr);
+  uint64_t Diff = AlignedAddr - StorageAddr;
+
+  // For each flag we allocate one byte in each location: Incoming and Handled.
+  // TODO: 'Handled' could be a bitset, but size must be dynamic
+  NumFlagsUsed.store(0);
+  NumFlagsAllocated = NumPagesEach * PageSize;
+  FlagsStorage = static_cast<uint8_t *>(Storage);
+  FlagsIncoming = reinterpret_cast<Flag *>(FlagsStorage + Diff);
+  FlagsHandled = FlagsIncoming + NumFlagsAllocated;
+  static_assert(sizeof(FlagsIncoming[0]) == sizeof(uint8_t), "Flags are bytes");
+  assert(reinterpret_cast<uint64_t>(FlagsIncoming) % PageSize == 0);
+  assert(reinterpret_cast<uint64_t>(FlagsHandled) % PageSize == 0);
+  assert(NumFlagsAllocated >= MinFlags);
+}
+
+// Atomically reserves Count consecutive flags and returns the first index.
+unsigned ThinLtoInstrumentationLayer::reserveDiscoveryFlags(unsigned Count) {
+  assert(Count > 0);
+  unsigned Before = NumFlagsUsed.load(), After = 0;
+  do {
+    // Fixed-size buffer: fail rather than hand out out-of-bounds flag slots.
+    if (Count > NumFlagsAllocated - Before)
+      report_fatal_error("ThinLtoJIT: out of discovery flags");
+    After = Before + Count;
+  } while (!NumFlagsUsed.compare_exchange_weak(Before, After));
+#ifndef NDEBUG
+  for (unsigned i = Before; i < After; i++)
+    assert(FlagsIncoming[i] == Clear);
+#endif
+  return Before; // First reserved index
+}
+
+// Records Guids[k] as the owner of flag FirstIdx + k while holding the lock.
+void ThinLtoInstrumentationLayer::registerDiscoveryFlagOwners(
+    std::vector<GlobalValue::GUID> Guids, unsigned FirstIdx) {
+  std::lock_guard<std::mutex> Guard(DiscoveryFlagsInfoLock);
+  for (unsigned K = 0, E = Guids.size(); K < E; ++K) {
+    const unsigned FlagIdx = FirstIdx + K;
+    assert(!FlagOwnersMap.count(FlagIdx) &&
+           "Flag should not have an owner at this point");
+    FlagOwnersMap[FlagIdx] = Guids[K];
+  }
+}
+
+// Returns flags that fired and were not reported yet; marks them as handled.
+std::vector<unsigned> ThinLtoInstrumentationLayer::takeFlagsThatFired() {
+  // Pairs with the release-stores to FlagsSync; has no effect without them.
+  FlagsSync.load(std::memory_order_acquire);
+
+  std::vector<unsigned> NewlyFired;
+  for (unsigned Idx = 0, End = NumFlagsUsed.load(); Idx < End; ++Idx) {
+    if (FlagsIncoming[Idx] != Fired || FlagsHandled[Idx] != Clear)
+      continue;
+    FlagsHandled[Idx] = Fired;
+    NewlyFired.push_back(Idx);
+  }
+
+  return NewlyFired;
+}
+
+// Removes the owner entries of the given flags and returns their GUIDs.
+std::vector<GlobalValue::GUID>
+ThinLtoInstrumentationLayer::takeFlagOwners(std::vector<unsigned> Indexes) {
+  std::vector<GlobalValue::GUID> Owners;
+  std::lock_guard<std::mutex> Guard(DiscoveryFlagsInfoLock);
+  for (unsigned FlagIdx : Indexes) {
+    auto Entry = FlagOwnersMap.find(FlagIdx);
+    assert(Entry != FlagOwnersMap.end());
+    GlobalValue::GUID Owner = Entry->second;
+    FlagOwnersMap.erase(Entry);
+    Owners.push_back(Owner);
+  }
+  return Owners;
+}
+
+// Marks the given functions as reached without waiting for them to execute.
+void ThinLtoInstrumentationLayer::nudgeIntoDiscovery(
+    std::vector<GlobalValue::GUID> Functions) {
+  unsigned Count = Functions.size();
+  if (Count == 0)
+    return; // Nothing to nudge; reserveDiscoveryFlags() requires Count > 0.
+  // Registering synthetic flags in advance. We expect them to get processed
+  // before the respective functions get emitted. If not, emit() still
+  // instruments the functions with separate flags of their own.
+  unsigned FirstFlagIdx = reserveDiscoveryFlags(Count);
+  registerDiscoveryFlagOwners(std::move(Functions), FirstFlagIdx);
+
+  // Initialize the flags as fired and force a cache sync, so discovery will
+  // pick them up as soon as possible.
+  for (unsigned i = FirstFlagIdx; i < FirstFlagIdx + Count; i++)
+    FlagsIncoming[i] = Fired;
+  if (InsertMemBarrier & ExplicitMemoryBarrier::StaticCode)
+    FlagsSync.store(0, std::memory_order_release);
+  LLVM_DEBUG(dbgs() << "Nudged " << Count << " new functions into discovery\n");
+}
+
+// Gives every function definition in the module a prolog block that sets its
+// discovery flag, then passes the module on to the base layer.
+void ThinLtoInstrumentationLayer::emit(MaterializationResponsibility R,
+                                       ThreadSafeModule TSM) {
+  TSM.withModuleDo([this](Module &M) {
+    // We may have discovered ahead of some functions already, but we still
+    // instrument them all. Their notifications steer the future direction of
+    // discovery.
+    std::vector<Function *> Definitions;
+    for (Function &Fn : M.getFunctionList())
+      if (!Fn.isDeclaration())
+        Definitions.push_back(&Fn);
+
+    if (Definitions.empty())
+      return;
+
+    // Flags that fire must have owners registered. We will do it below and
+    // that's fine, because they can only be reached once the code is emitted.
+    const unsigned FirstFlagIdx = reserveDiscoveryFlags(Definitions.size());
+
+    IRBuilder<> Builder(M.getContext());
+    std::vector<GlobalValue::GUID> Owners;
+    for (unsigned Offset = 0; Offset < Definitions.size(); ++Offset) {
+      Function *Fn = Definitions[Offset];
+      BasicBlock *OldEntry = &Fn->getEntryBlock();
+      BasicBlock *Prolog = BasicBlock::Create(
+          M.getContext(), "NotifyFunctionReachedProlog", Fn, OldEntry);
+
+      Builder.SetInsertPoint(Prolog);
+      compileFunctionReachedFlagSetter(Builder,
+                                       FlagsIncoming + FirstFlagIdx + Offset);
+      Builder.CreateBr(OldEntry);
+      Owners.push_back(GlobalValue::getGUID(Fn->getName()));
+    }
+
+    LLVM_DEBUG(dbgs() << "Instrumented " << Owners.size()
+                      << " new functions in module " << M.getName() << "\n");
+
+    // Submit owner info, so the DiscoveryThread can evaluate the flags.
+    registerDiscoveryFlagOwners(std::move(Owners), FirstFlagIdx);
+  });
+
+  BaseLayer.emit(std::move(R), std::move(TSM));
+}
+
+// Emits, at B's insert point, a store of 'Fired' to the fixed address of F.
+void ThinLtoInstrumentationLayer::compileFunctionReachedFlagSetter(
+    IRBuilder<> &B, Flag *F) {
+  assert(*F == Clear);
+  Type *Int64Ty = Type::getInt64Ty(B.getContext());
+  // Write one immediate 8bit value to a fixed location in memory.
+  auto FlagAddr = pointerToJITTargetAddress(F);
+  Type *FlagTy = Type::getInt8Ty(B.getContext());
+  B.CreateStore(ConstantInt::get(FlagTy, Fired),
+                B.CreateIntToPtr(ConstantInt::get(Int64Ty, FlagAddr),
+                                 FlagTy->getPointerTo()));
+  if (InsertMemBarrier & ExplicitMemoryBarrier::JITedCode) {
+    // Overwrite the sync value with Release ordering. The discovery thread
+    // reads it with Acquire ordering. The actual value doesn't matter.
+    static constexpr bool IsVolatile = true;
+    static constexpr Instruction *NoInsertBefore = nullptr;
+    auto SyncFlagAddr = pointerToJITTargetAddress(&FlagsSync);
+    // Alignment is in bytes; take it from FlagsSync rather than overstating it.
+    B.Insert(
+        new StoreInst(ConstantInt::get(Int64Ty, 0),
+                      B.CreateIntToPtr(ConstantInt::get(Int64Ty, SyncFlagAddr),
+                                       Int64Ty->getPointerTo()),
+                      IsVolatile, MaybeAlign(alignof(decltype(FlagsSync))),
+                      AtomicOrdering::Release, SyncScope::System,
+                      NoInsertBefore));
+  }
+}
+
+// Dumps flag usage statistics inside LLVM_DEBUG and then frees the flag
+// storage.
+ThinLtoInstrumentationLayer::~ThinLtoInstrumentationLayer() {
+  LLVM_DEBUG({
+    dbgs() << "Discovery flags stats\n";
+
+    // Count the incoming flags that were ever set.
+    unsigned FiredCount = 0;
+    for (unsigned Idx = 0; Idx != NumFlagsAllocated; ++Idx)
+      FiredCount += (FlagsIncoming[Idx] == Fired) ? 1 : 0;
+
+    dbgs() << "Alloc:  " << format("%6.d", NumFlagsAllocated) << "\n"
+           << "Issued: " << format("%6.d", NumFlagsUsed.load()) << "\n"
+           << "Fired:  " << format("%6.d", FiredCount) << "\n";
+
+    // Entries left here belong to flags that were registered but never taken.
+    const unsigned LeftoverOwners = FlagOwnersMap.size();
+    dbgs() << "\nFlagOwnersMap has " << LeftoverOwners
+           << " remaining entries.\n";
+  });
+
+  // Allocated with calloc() in allocateDiscoveryFlags().
+  std::free(FlagsStorage);
+}
+
+} // namespace orc
+} // namespace llvm
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoJIT.h b/llvm/examples/ThinLtoJIT/ThinLtoJIT.h
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoJIT.h
@@ -0,0 +1,107 @@
+#ifndef LLVM_EXAMPLES_THINLTOJIT_THINLTOJIT_H
+#define LLVM_EXAMPLES_THINLTOJIT_THINLTOJIT_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
+#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ThreadPool.h"
+
+#include "ThinLtoInstrumentationLayer.h"
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+class ThinLtoModuleIndex;
+class ThinLtoDiscoveryThread;
+
+class RTDyldObjectLinkingLayer;
+class IRCompileLayer;
+class CompileOnDemandLayer;
+
+class JITDylib;
+class MangleAndInterner;
+class LazyCallThroughManager;
+
+/// Owns the ORC layer stack, the ThinLTO module index and the discovery worker.
+class ThinLtoJIT {
+public:
+  using AddModuleFunction = std::function<Error(ThreadSafeModule)>;
+
+  ThinLtoJIT(std::vector<StringRef> ModuleFiles, StringRef MainFunctionName,
+             unsigned NumCompileThreads, unsigned DiscoveryFlagsPerBatch,
+             ThinLtoInstrumentationLayer::ExplicitMemoryBarrier MemFence,
+             bool AllowNudgeIntoDiscovery, Error &Err);
+  ~ThinLtoJIT();
+
+  ThinLtoJIT(const ThinLtoJIT &) = delete;
+  ThinLtoJIT &operator=(const ThinLtoJIT &) = delete;
+  ThinLtoJIT(ThinLtoJIT &&) = delete;
+  ThinLtoJIT &operator=(ThinLtoJIT &&) = delete;
+
+  /// Looks up the main function in MainJD and calls it with argc/argv.
+  Expected<int> main(int argc, const char *argv[]) {
+    auto MainSym = ES.lookup({MainJD}, MainFunctionMangled);
+    if (!MainSym)
+      return MainSym.takeError();
+    auto Main = jitTargetAddressToFunction<int (*)(int, const char *[])>(
+        MainSym->getAddress());
+    return Main(argc, argv);
+  }
+
+private:
+  ExecutionSession ES;
+  std::unique_ptr<DataLayout> DL;
+
+  // Wrapper class to allow late construction of the mangler while preserving
+  // the conventional Mangle(SymbolName) syntax.
+  // TODO: Alternatively the original MangleAndInterner could be turned into a
+  // regular class (default-constructible and assignable).
+  struct MangleWrapper {
+    SymbolStringPtr operator()(StringRef S) { return Impl->operator()(S); }
+    std::unique_ptr<MangleAndInterner> Impl{nullptr};
+  };
+  MangleWrapper Mangle;
+
+  // Null until the constructor created the dylib; AddModule asserts on it.
+  JITDylib *MainJD = nullptr;
+  SymbolStringPtr MainFunctionMangled;
+  std::unique_ptr<ThreadPool> CompileThreads;
+  std::unique_ptr<ThinLtoModuleIndex> GlobalIndex;
+
+  AddModuleFunction AddModule;
+  AddModuleFunction AddModuleAndLookup;
+  std::unique_ptr<RTDyldObjectLinkingLayer> ObjLinkingLayer;
+  std::unique_ptr<IRCompileLayer> CompileLayer;
+  std::unique_ptr<ThinLtoInstrumentationLayer> InstrumentationLayer;
+  std::unique_ptr<CompileOnDemandLayer> OnDemandLayer;
+
+  // Loop condition of the discovery thread; cleared by the destructor.
+  std::atomic<bool> JitRunning{false};
+  std::unique_ptr<ThinLtoDiscoveryThread> DiscoveryThreadWorker;
+  std::unique_ptr<LazyCallThroughManager> CallThroughManager;
+
+  Error
+  setupLayers(Triple TT, unsigned DiscoveryFlagsPerBatch,
+              ThinLtoInstrumentationLayer::ExplicitMemoryBarrier MemFence);
+  Error setupJITDylib(JITDylib *JD, bool AllowNudge);
+  Error setupDiscovery(unsigned NumCompileThreads);
+  Expected<ThreadSafeModule> setupMainModule(StringRef MainFunction);
+  static void exitOnLazyCallThroughFailure() {
+    errs() << "Compilation failed. Aborting.\n";
+    exit(1);
+  }
+};
+
+} // namespace orc
+} // namespace llvm
+
+#endif
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp b/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp
@@ -0,0 +1,313 @@
+#include "ThinLtoJIT.h"
+
+#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h"
+#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
+#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
+#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
+#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/Support/Debug.h"
+
+#include "ThinLtoDiscoveryThread.h"
+#include "ThinLtoInstrumentationLayer.h"
+#include "ThinLtoModuleIndex.h"
+
+#include <set>
+#include <string>
+#include <thread>
+
+#ifndef NDEBUG
+#include <chrono>
+#endif
+
+#define DEBUG_TYPE "thinltojit"
+
+namespace llvm {
+namespace orc {
+
+// Generator that resolves missing symbols by loading their defining modules.
+class ThinLtoDefinitionGenerator : public JITDylib::DefinitionGenerator {
+public:
+  ThinLtoDefinitionGenerator(ThinLtoModuleIndex &GlobalIndex,
+                             ThinLtoInstrumentationLayer &InstrumentationLayer,
+                             ThinLtoJIT::AddModuleFunction AddModule,
+                             char ManglePrefix, bool AllowNudge)
+      : GlobalIndex(GlobalIndex), InstrumentationLayer(InstrumentationLayer),
+        AddModule(std::move(AddModule)), ManglePrefix(ManglePrefix),
+        AllowNudgeIntoDiscovery(AllowNudge) {}
+  Error tryToGenerate(LookupKind K, JITDylib &JD,
+                      JITDylibLookupFlags JDLookupFlags,
+                      const SymbolLookupSet &Symbols) override;
+
+private:
+  ThinLtoModuleIndex &GlobalIndex;
+  ThinLtoInstrumentationLayer &InstrumentationLayer;
+  ThinLtoJIT::AddModuleFunction AddModule;
+  char ManglePrefix;
+  bool AllowNudgeIntoDiscovery;
+  // ThinLTO summaries encode unprefixed names. Empty names pass through.
+  StringRef stripGlobalManglePrefix(StringRef Symbol) const {
+    if (ManglePrefix == '\0' || Symbol.empty() || Symbol[0] != ManglePrefix)
+      return Symbol;
+    return Symbol.drop_front();
+  }
+};
+
+// Adds the modules defining the requested symbols (per the global index).
+Error ThinLtoDefinitionGenerator::tryToGenerate(
+    LookupKind K, JITDylib &JD, JITDylibLookupFlags JDLookupFlags,
+    const SymbolLookupSet &Symbols) {
+#ifndef NDEBUG
+  unsigned Added = 0;
+#endif
+
+  std::vector<GlobalValue::GUID> NewDiscoveryRoots;
+  StringMap<std::vector<StringRef>> SymbolNamesByModulePath;
+
+  for (const auto &KV : Symbols) {
+    StringRef Name = stripGlobalManglePrefix(*KV.first);
+    auto Guid = GlobalValue::getGUID(Name);
+
+    if (GlobalValueSummary *S = GlobalIndex.getSummary(Guid)) {
+      std::vector<StringRef> &Names = SymbolNamesByModulePath[S->modulePath()];
+      Names.push_back(Name);
+      if (AllowNudgeIntoDiscovery && isa<FunctionSummary>(S))
+        NewDiscoveryRoots.push_back(Guid); // Reuse the hash computed above.
+    }
+  }
+
+  for (const auto &KV : SymbolNamesByModulePath) {
+    Expected<Optional<ThreadSafeModule>> TSM =
+        GlobalIndex.parseNewModuleFromFile(KV.first());
+    if (!TSM) {
+      // Parsing the module from disk failed, after we successfully obtained
+      // ValueInfos for its symbols from ThinLTO summaries.
+      return TSM.takeError();
+    }
+
+    // We did parse the module already, but the add request is waiting "outside"
+    // for the lock that will be freed once this request is done. There appears
+    // to be no way to temporarily suspend the request and get the symbol ready.
+    // Thus, we must parse the module here again and submit it before returning.
+    // It's quite expensive as we are actively blocking execution at this point.
+    if (!*TSM) {
+      InstrumentationLayer.getExecutionSession().reportError(
+          createStringError(inconvertibleErrorCode(),
+                            "Module %s required for symbol %s was added while "
+                            "request for it was in progress. Reparsing!",
+                            KV.first().data(), KV.second.front().data()));
+
+      static constexpr bool ForceLoad = true;
+      TSM = GlobalIndex.parseNewModuleFromFile(KV.first(), ForceLoad);
+      if (!TSM) {
+        // Parsing the module from disk failed. This may happen any time.
+        return TSM.takeError();
+      }
+      assert(*TSM && "We forced the load operation");
+    }
+
+    if (Error LoadErr = AddModule(std::move(**TSM)))
+      // Found a module but failed to add it.
+      return LoadErr;
+
+#ifndef NDEBUG
+    ++Added;
+#endif
+  }
+
+  LLVM_DEBUG(dbgs() << "Generator: Added " << Added << " new modules\n");
+
+  // We can anticipate to run into the requested functions as soon as execution
+  // continues. Thus, we may trigger discovery flags for them already now to
+  // initiate discovery behind them. We will probably compile a few unnecessary
+  // things in this case.
+  if (!NewDiscoveryRoots.empty()) {
+    // The registration involves locking a mutex, so better do it in a
+    // separate thread.
+    std::thread(
+        [this](std::vector<GlobalValue::GUID> Rs) {
+          InstrumentationLayer.nudgeIntoDiscovery(std::move(Rs));
+        },
+        std::move(NewDiscoveryRoots))
+        .detach();
+  }
+
+  return Error::success();
+}
+
+// Builds the whole JIT: module index, layers, discovery thread and the main
+// JITDylib. The first fatal failure is stored in Err and construction stops.
+ThinLtoJIT::ThinLtoJIT(
+    std::vector<StringRef> ModuleFiles, StringRef MainFunctionName,
+    unsigned NumCompileThreads, unsigned DiscoveryFlagsPerBatch,
+    ThinLtoInstrumentationLayer::ExplicitMemoryBarrier MemFence,
+    bool AllowNudgeIntoDiscovery, Error &Err) {
+  ErrorAsOutParameter ErrAsOutParam(&Err);
+
+  // Index all given module files. Files that cannot be added are reported to
+  // the session, but they do not abort construction.
+  GlobalIndex = std::make_unique<ThinLtoModuleIndex>(ES);
+  for (StringRef File : ModuleFiles)
+    if (auto AddErr = GlobalIndex->add(File))
+      ES.reportError(std::move(AddErr));
+
+  auto MainTSM = setupMainModule(MainFunctionName);
+  if (!MainTSM) {
+    Err = MainTSM.takeError();
+    return;
+  }
+
+  ThreadSafeModule MainModule = std::move(*MainTSM);
+  auto *MainM = MainModule.getModuleUnlocked();
+
+  // Now that we know the target data layout we can setup the mangler.
+  DL = std::make_unique<DataLayout>(MainM);
+  Mangle.Impl = std::make_unique<MangleAndInterner>(ES, *DL);
+  MainFunctionMangled = Mangle(MainFunctionName);
+
+  // Each setup step below bails out on the first error.
+  if ((Err = setupLayers(Triple(MainM->getTargetTriple()),
+                         DiscoveryFlagsPerBatch, MemFence)))
+    return;
+  if ((Err = setupDiscovery(NumCompileThreads)))
+    return;
+
+  // The main dylib must exist before AddModule is used (see setupLayers).
+  MainJD = &ES.createJITDylib("main");
+  if ((Err = setupJITDylib(MainJD, AllowNudgeIntoDiscovery)))
+    return;
+  if ((Err = AddModule(std::move(MainModule))))
+    return;
+
+  // Optionally start discovery from the main function right away.
+  if (AllowNudgeIntoDiscovery)
+    InstrumentationLayer->nudgeIntoDiscovery(
+        {GlobalValue::getGUID(MainFunctionName)});
+
+#ifndef NDEBUG
+  // Uncomment to give the discovery thread some time to do things.
+  // std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+#endif
+}
+
+// Finds the module that defines MainFunction in the index and parses it.
+Expected<ThreadSafeModule> ThinLtoJIT::setupMainModule(StringRef MainFunction) {
+  Optional<StringRef> Path = GlobalIndex->getModulePathForSymbol(MainFunction);
+  if (Path) {
+    auto Parsed = GlobalIndex->parseNewModuleFromFile(*Path);
+    if (!Parsed)
+      return Parsed.takeError();
+    assert(*Parsed && "This is the first module, it cannot exist yet");
+    return std::move(**Parsed);
+  }
+  std::string Message;
+  raw_string_ostream Stream(Message);
+  Stream << "No ValueInfo for symbol '" << MainFunction
+         << "' in provided modules: ";
+  for (StringRef Candidate : GlobalIndex->getAllModulePaths())
+    Stream << Candidate << " ";
+  Stream << "\n";
+  return createStringError(inconvertibleErrorCode(), Stream.str());
+}
+
+// Creates the layer stack bottom-up (linking, compile, instrumentation,
+// compile-on-demand) and installs the AddModule callback.
+Error ThinLtoJIT::setupLayers(
+    Triple TT, unsigned DiscoveryFlagsPerBatch,
+    ThinLtoInstrumentationLayer::ExplicitMemoryBarrier MemFence) {
+  ObjLinkingLayer = std::make_unique<RTDyldObjectLinkingLayer>(
+      ES, []() { return std::make_unique<SectionMemoryManager>(); });
+  CompileLayer = std::make_unique<IRCompileLayer>(
+      ES, *ObjLinkingLayer, ConcurrentIRCompiler(JITTargetMachineBuilder(TT)));
+  InstrumentationLayer = std::make_unique<ThinLtoInstrumentationLayer>(
+      ES, *CompileLayer, MemFence, DiscoveryFlagsPerBatch);
+
+  auto ISMBuilder = createLocalIndirectStubsManagerBuilder(TT);
+  auto CallThrough = createLocalLazyCallThroughManager(
+      TT, ES, pointerToJITTargetAddress(exitOnLazyCallThroughFailure));
+  if (!CallThrough)
+    return CallThrough.takeError();
+  CallThroughManager = std::move(*CallThrough);
+
+  OnDemandLayer = std::make_unique<CompileOnDemandLayer>(
+      ES, *InstrumentationLayer, *CallThroughManager, std::move(ISMBuilder));
+  // Don't break up modules. Insert stubs on module boundaries.
+  OnDemandLayer->setPartitionFunction(CompileOnDemandLayer::compileWholeModule);
+
+  AddModule = [this](ThreadSafeModule TSM) -> Error {
+    assert(MainJD && "Setup MainJD JITDylib before calling");
+    return OnDemandLayer->add(*MainJD, std::move(TSM));
+  };
+  return Error::success();
+}
+
+Error ThinLtoJIT::setupDiscovery(unsigned NumCompileThreads) {
+  // Delegate compilation to the thread pool.
+  CompileThreads = std::make_unique<ThreadPool>(NumCompileThreads);
+  ES.setDispatchMaterialization(
+      [this](JITDylib &JD, std::shared_ptr<MaterializationUnit> MU) {
+        CompileThreads->async([MU, &JD]() { MU->doMaterialize(JD); });
+      });
+
+  // Set to 1 to avoid discovering all at once when debugging small examples.
+  unsigned LookaheadLevels = 3;
+  // Spawn discovery thread and let it add newly discovered modules to the JIT.
+  // We lookup one symbol right away to force immediate materialization.
+  JitRunning.store(true);
+  DiscoveryThreadWorker = std::make_unique<ThinLtoDiscoveryThread>(
+      JitRunning, *InstrumentationLayer, *GlobalIndex, LookaheadLevels,
+      [this](ThreadSafeModule TSM) -> Error {
+        // Look up the first definition that actually gets a JIT symbol.
+        std::string FunctionNameForLookup;
+        for (const Function &F : TSM.getModuleUnlocked()->getFunctionList())
+          if (FunctionNameForLookup.empty() && !F.isDeclaration() &&
+              !F.hasLocalLinkage() && !F.hasAvailableExternallyLinkage())
+            FunctionNameForLookup = F.getName().str();
+        if (Error SubmitErr = AddModule(std::move(TSM)))
+          return SubmitErr;
+        if (FunctionNameForLookup.empty())
+          return Error::success(); // Nothing to force materialization with.
+        // TODO: This is quite workaroundish. CompileOnDemandLayer could have a
+        // flag to force materialization without an extra lookup like this.
+        auto LookupRes = ES.lookup({MainJD}, Mangle(FunctionNameForLookup));
+        if (!LookupRes)
+          return LookupRes.takeError();
+        assert(LookupRes->getAddress() && "Function should be emitted now");
+        return Error::success();
+      });
+
+  std::thread(std::ref(*DiscoveryThreadWorker)).detach();
+  return Error::success();
+}
+
+// Installs the C++ runtime overrides and the two symbol generators on JD.
+Error ThinLtoJIT::setupJITDylib(JITDylib *JD, bool AllowNudge) {
+  // Register symbols for C++ static destructors.
+  LocalCXXRuntimeOverrides RuntimeOverrides;
+  if (Error EnableErr = RuntimeOverrides.enable(*JD, *Mangle.Impl))
+    return EnableErr;
+
+  // First generator: resolve names through the global ThinLTO module index.
+  char Prefix = DL->getGlobalPrefix();
+  auto IndexLookup = std::make_unique<ThinLtoDefinitionGenerator>(
+      *GlobalIndex, *InstrumentationLayer, AddModule, Prefix, AllowNudge);
+  JD->addGenerator(std::move(IndexLookup));
+  // Second generator: fall back to a lookup in the host process.
+  auto HostLookup = DynamicLibrarySearchGenerator::GetForCurrentProcess(Prefix);
+  if (!HostLookup)
+    return HostLookup.takeError();
+  JD->addGenerator(std::move(*HostLookup));
+  return Error::success();
+}
+
+ThinLtoJIT::~ThinLtoJIT() {
+  JitRunning.store(false); // Signal the DiscoveryThread to shut down.
+  // Wait for compile actions; the pool is null if construction failed early.
+  if (CompileThreads)
+    CompileThreads->wait();
+}
+
+} // namespace orc
+} // namespace llvm
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h
@@ -0,0 +1,47 @@
+#ifndef LLVM_EXAMPLES_THINLTOJIT_THINLTOJITMODULEINDEX_H
+#define LLVM_EXAMPLES_THINLTOJIT_THINLTOJITMODULEINDEX_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/Support/Error.h"
+
+#include <cstdint>
+#include <mutex>
+#include <set>
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+class SymbolStringPtr;
+
+// Combined ThinLTO summary index plus a record of already parsed modules.
+class ThinLtoModuleIndex {
+  static constexpr bool HaveGVs = false;
+public:
+  ThinLtoModuleIndex(ExecutionSession &ES)
+      : ES(ES), CombinedSummaryIndex(HaveGVs), NextModuleId(0) {}
+
+  Error add(StringRef ModulePath);
+  GlobalValueSummary *getSummary(GlobalValue::GUID Function) const;
+  std::vector<StringRef> getAllModulePaths() const;
+  Optional<StringRef> getModulePathForSymbol(StringRef Name) const;
+  // Yields None if Path was requested before and ForceLoad is not set.
+  Expected<Optional<ThreadSafeModule>>
+  parseNewModuleFromFile(StringRef Path, bool ForceLoad = false);
+
+private:
+  ExecutionSession &ES;
+  ModuleSummaryIndex CombinedSummaryIndex;
+  uint64_t NextModuleId; // Module id used by the next successful add().
+  std::mutex ParsedModulesLock; // Guards ParsedModules.
+  std::set<SymbolStringPtr> ParsedModules;
+};
+
+} // namespace orc
+} // namespace llvm
+
+#endif
diff --git a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp
@@ -0,0 +1,94 @@
+#include "ThinLtoModuleIndex.h"
+
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <memory>
+#include <string>
+
+#define DEBUG_TYPE "thinltojit"
+
+namespace llvm {
+namespace orc {
+
+Error ThinLtoModuleIndex::add(StringRef ModulePath) {
+  auto FileOrErr = errorOrToExpected(MemoryBuffer::getFile(ModulePath));
+  if (!FileOrErr)
+    return FileOrErr.takeError();
+  // Merge the file's summary into the combined index under the next free id.
+  MemoryBufferRef Contents = (*FileOrErr)->getMemBufferRef();
+  if (Error ReadErr = readModuleSummaryIndex(Contents, CombinedSummaryIndex,
+                                             NextModuleId))
+    return ReadErr;
+  // The id is consumed only after the summary was read successfully.
+  ++NextModuleId;
+  return Error::success();
+}
+
+std::vector<StringRef> ThinLtoModuleIndex::getAllModulePaths() const {
+  // Collect the key (the module path) of every entry in the combined
+  // index's module path table, in the table's iteration order.
+  std::vector<StringRef> Result;
+  for (const auto &Entry : CombinedSummaryIndex.modulePaths())
+    Result.emplace_back(Entry.first());
+  return Result;
+}
+
+GlobalValueSummary *
+ThinLtoModuleIndex::getSummary(GlobalValue::GUID Function) const {
+  // Unknown GUIDs and GUIDs without a summary both yield nullptr.
+  ValueInfo Info = CombinedSummaryIndex.getValueInfo(Function);
+  if (!Info || Info.getSummaryList().empty())
+    return nullptr;
+#ifndef NDEBUG
+  if (Info.getSummaryList().size() > 1)
+    LLVM_DEBUG(dbgs() << "SummaryList with multiple entries!\n");
+#endif
+  return Info.getSummaryList().front()->getBaseObject();
+}
+
+Optional<StringRef>
+ThinLtoModuleIndex::getModulePathForSymbol(StringRef Name) const {
+  GlobalValueSummary *Summary = getSummary(GlobalValue::getGUID(Name));
+  // Without a summary we don't know the symbol.
+  return Summary ? Optional<StringRef>(Summary->modulePath()) : None;
+}
+
+Expected<Optional<ThreadSafeModule>>
+ThinLtoModuleIndex::parseNewModuleFromFile(StringRef Path, bool ForceLoad) {
+  // Parse the IR file at Path into a module with its own LLVMContext. Returns
+  // None when the path was already requested earlier (unless ForceLoad is
+  // set, which skips the bookkeeping entirely) and an Error if parsing fails.
+  if (!ForceLoad) {
+    std::lock_guard<std::mutex> Lock(ParsedModulesLock);
+    // insert() reports whether the path was new: a single lookup both checks
+    // and records it.
+    if (!ParsedModules.insert(ES.intern(Path)).second)
+      return None; // This is not a new module.
+  }
+
+  // TODO: make a SMDiagnosticError class for this
+  SMDiagnostic Err;
+  auto Ctx = std::make_unique<LLVMContext>();
+  auto M = parseIRFile(Path, Err, *Ctx);
+  if (!M) {
+    std::string ErrDescription;
+    {
+      raw_string_ostream S(ErrDescription);
+      Err.print("ThinLtoJIT", S);
+    }
+    // StringRef need not be null-terminated; make an owned copy for "%s".
+    return createStringError(inconvertibleErrorCode(),
+                             "Failed to load module from file '%s' (%s)",
+                             Path.str().c_str(), ErrDescription.c_str());
+  }
+
+  return ThreadSafeModule(std::move(M), std::move(Ctx));
+}
+
+} // namespace orc
+} // namespace llvm
diff --git a/llvm/examples/ThinLtoJIT/bench b/llvm/examples/ThinLtoJIT/bench
new file mode 100755
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/bench
@@ -0,0 +1,89 @@
+#!/bin/bash
+#set -x
+
+# The first three arguments are mandatory; print the usage and bail otherwise.
+if [ $# -lt 3 ]; then
+  echo "Usage: bench <path to llvm binaries> <path to c-sources> <main source file> [<override sysroot>]"
+  exit 1
+fi
+TOOLS_DIR="$1"
+SOURCE_DIR="$2"
+MAIN_SOURCE_FILE="$3"
+
+# An optional fourth argument overrides the sysroot handed to clang.
+SYS_ROOT="/"
+if [ $# -ge 4 ]; then
+  SYS_ROOT="$4"
+fi
+
+# Report whether tool $1 exists in TOOLS_DIR; a missing tool only warns.
+function check_tool () {
+  if [ ! -e "${TOOLS_DIR}/$1" ]; then
+    echo "!!! Cannot find required tool, please provide it in the LLVM binaries folder: $1"
+  else
+    echo "Found: $1"
+  fi
+}
+
+# The JIT drivers under comparison are looked up in TOOLS_DIR.
+check_tool lli
+check_tool SpeculativeJIT
+check_tool ThinLtoJIT
+
+# Bitcode is generated only when none of the output directories exist yet.
+if [[ -e bc-default || -e bc-thinlto || -e ll-default || -e ll-thinlto ]]; then
+  SKIP_BITCODE_GEN=1
+  echo "Skipping bitcode generation: output directories existing"
+  echo "Please clean up manually: rm -R bc-default bc-thinlto ll-default ll-thinlto"
+else
+  SKIP_BITCODE_GEN=0
+  # Generating bitcode additionally uses clang and llvm-dis.
+  check_tool clang
+  check_tool llvm-dis
+  mkdir bc-default bc-thinlto ll-default ll-thinlto
+fi
+
+ROOT_DIR=$(pwd)
+ALL_BITCODE_FILES=""
+
+MAIN_FILE_BASENAME=$(basename "${MAIN_SOURCE_FILE%.c*}")
+LLI_EXTRA_MODULES=""
+
+for f in ${SOURCE_DIR}/*.c* ; do
+  BASE_NAME=$(basename "${f%.c*}")
+
+  if [ ${SKIP_BITCODE_GEN} -eq 0 ]; then
+    echo "Compile: $f -> ${BASE_NAME}.bc"
+
+    "${TOOLS_DIR}/clang" -c -I ${SOURCE_DIR} -isysroot ${SYS_ROOT} -emit-llvm \
+                        -o "bc-default/${BASE_NAME}.bc" "$f"
+    "${TOOLS_DIR}/clang" -c -I ${SOURCE_DIR} -isysroot ${SYS_ROOT} -flto=thin \
+                        -o "bc-thinlto/${BASE_NAME}.bc" "$f"
+
+    echo "Disassemble ${BASE_NAME}.bc -> ${BASE_NAME}.ll"
+    ${TOOLS_DIR}/llvm-dis bc-default/${BASE_NAME}.bc -o ll-default/${BASE_NAME}.ll
+    ${TOOLS_DIR}/llvm-dis bc-thinlto/${BASE_NAME}.bc -o ll-thinlto/${BASE_NAME}.ll
+  fi
+
+  ALL_BITCODE_FILES="${ALL_BITCODE_FILES} ${BASE_NAME}.bc"
+  if [ "${BASE_NAME}" != "${MAIN_FILE_BASENAME}" ]; then
+    LLI_EXTRA_MODULES="${LLI_EXTRA_MODULES} -extra-module=${BASE_NAME}.bc"
+  fi
+done
+
+set -x
+cd ${ROOT_DIR}/bc-default
+time (${TOOLS_DIR}/clang -o ${MAIN_FILE_BASENAME} -O0 ${ALL_BITCODE_FILES} && ./${MAIN_FILE_BASENAME} 1>/dev/null)
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=mcjit "${MAIN_FILE_BASENAME}.bc" 1>/dev/null
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-mcjit "${MAIN_FILE_BASENAME}.bc" 1>/dev/null
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy "${MAIN_FILE_BASENAME}.bc" 1>/dev/null
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -compile-threads=2 "${MAIN_FILE_BASENAME}.bc" 1>/dev/null
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy "${MAIN_FILE_BASENAME}.bc" 1>/dev/null
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy -compile-threads=2 "${MAIN_FILE_BASENAME}.bc" 1>/dev/null
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy -compile-threads=2 -O1 "${MAIN_FILE_BASENAME}.bc" 1>/dev/null
+time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy -compile-threads=2 -O0 "${MAIN_FILE_BASENAME}.bc" 1>/dev/null
+time ${TOOLS_DIR}/SpeculativeJIT -num-threads=2 ${ALL_BITCODE_FILES} 1>/dev/null
+
+cd ${ROOT_DIR}/bc-thinlto
+#time (${TOOLS_DIR}/clang -flto=thin -o test ${ALL_BITCODE_FILES} && ./test 1>/dev/null)
+time ${TOOLS_DIR}/ThinLtoJIT -compile-threads=2 ${ALL_BITCODE_FILES} 1>/dev/null
diff --git a/llvm/examples/ThinLtoJIT/main.cpp b/llvm/examples/ThinLtoJIT/main.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/examples/ThinLtoJIT/main.cpp
@@ -0,0 +1,82 @@
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/TargetSelect.h"
+
+#include "ThinLtoInstrumentationLayer.h"
+#include "ThinLtoJIT.h"
+
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+static cl::list<std::string> InputFiles(cl::Positional, cl::OneOrMore,
+                                        cl::desc("<bitcode files>"));
+// Forwarded by main() as the argument vector of the JITed main function.
+static cl::list<std::string> InputArgs("args", cl::Positional,
+                                       cl::desc("<program arguments>..."),
+                                       cl::ZeroOrMore, cl::PositionalEatsArgs);
+// main() raises values below 1 to 1 before handing this to the JIT.
+static cl::opt<unsigned> CompileThreads("compile-threads", cl::Optional,
+                                        cl::desc("Number of compile threads"),
+                                        cl::init(4));
+// The remaining options are passed unchanged to the ThinLtoJIT constructor.
+static cl::opt<unsigned> DiscoveryFlagsBatchSize(
+    "discovery-flag-batch-size", cl::Optional,
+    cl::desc("Number of discovery flags allocated in one go"), cl::init(4096));
+
+static cl::opt<orc::ThinLtoInstrumentationLayer::ExplicitMemoryBarrier>
+    MemFence(
+        "mem-fence",
+        cl::desc(
+            "Choose where to install memory fences for cache synchronization"),
+        cl::init(orc::ThinLtoInstrumentationLayer::Always),
+        cl::values(clEnumValN(orc::ThinLtoInstrumentationLayer::Never, "never",
+                              "No use of memory fences"),
+                   clEnumValN(orc::ThinLtoInstrumentationLayer::StaticCode,
+                              "static",
+                              "Use of memory fences in static code only"),
+                   clEnumValN(orc::ThinLtoInstrumentationLayer::JITedCode,
+                              "jited",
+                              "Install memory fences in JITed code only"),
+                   clEnumValN(orc::ThinLtoInstrumentationLayer::Always,
+                              "always", "Always use of memory fences")));
+
+static cl::opt<bool> AllowNudge("allow-nudge",
+                                cl::desc("Allow the symbol generator to nudge symbols into discovery although they haven't been reached"),
+                                cl::init(true));
+
+int main(int argc, char *argv[]) {
+  // Parse the command line, build the JIT and run the JITed "main".
+  InitLLVM X(argc, argv);
+  InitializeNativeTarget();
+  InitializeNativeTargetAsmPrinter();
+  cl::ParseCommandLineOptions(argc, argv, "ThinLtoJIT");
+
+  std::vector<StringRef> ModuleFiles(InputFiles.begin(), InputFiles.end());
+  // Always compile on at least one thread, even for -compile-threads=0.
+  unsigned Threads = std::max(1u, static_cast<unsigned>(CompileThreads));
+
+  // The constructor reports failure through the Err out-parameter.
+  Error Err = Error::success();
+  orc::ThinLtoJIT Jit(std::move(ModuleFiles), "main", Threads,
+                      DiscoveryFlagsBatchSize, MemFence, AllowNudge, Err);
+  if (Err) {
+    logAllUnhandledErrors(std::move(Err), errs(), "ThinLtoJIT: ");
+    exit(1);
+  }
+
+  // A std::vector replaces the variable length array (non-standard, invalid
+  // for zero arguments); the extra null keeps argv[argc] == nullptr as in C.
+  const unsigned JitMainArgc = InputArgs.size();
+  std::vector<const char *> JitMainArgv;
+  for (const std::string &Arg : InputArgs)
+    JitMainArgv.push_back(Arg.c_str());
+  JitMainArgv.push_back(nullptr);
+
+  ExitOnError ExitOnErr;
+  ExitOnErr.setBanner("ThinLtoJIT: ");
+  return ExitOnErr(Jit.main(JitMainArgc, JitMainArgv.data()));
+}