diff --git a/llvm/examples/CMakeLists.txt b/llvm/examples/CMakeLists.txt --- a/llvm/examples/CMakeLists.txt +++ b/llvm/examples/CMakeLists.txt @@ -8,6 +8,7 @@ add_subdirectory(ModuleMaker) add_subdirectory(SpeculativeJIT) add_subdirectory(Bye) +add_subdirectory(ThinLtoJIT) if(LLVM_ENABLE_EH AND (NOT WIN32) AND (NOT "${LLVM_NATIVE_ARCH}" STREQUAL "ARM")) add_subdirectory(ExceptionDemo) diff --git a/llvm/examples/ThinLtoJIT/CMakeLists.txt b/llvm/examples/ThinLtoJIT/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/CMakeLists.txt @@ -0,0 +1,18 @@ +set(LLVM_LINK_COMPONENTS + Core + IRReader + OrcJIT + ExecutionEngine + Support + nativecodegen + Analysis + Passes + ) + +add_llvm_example(ThinLtoJIT + main.cpp + ThinLtoJIT.cpp + ThinLtoModuleIndex.cpp + ThinLtoInstrumentationLayer.cpp + ThinLtoDiscoveryThread.cpp + ) diff --git a/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h new file mode 100644 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h @@ -0,0 +1,45 @@ +#ifndef LLVM_EXAMPLES_THINLTOJIT_DISCOVERYTHREAD_H +#define LLVM_EXAMPLES_THINLTOJIT_DISCOVERYTHREAD_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/ModuleSummaryIndex.h" + +#include "ThinLtoJIT.h" + +#include +#include +#include + +namespace llvm { +namespace orc { + +class ThinLtoModuleIndex; +class ThinLtoInstrumentationLayer; + +class ThinLtoDiscoveryThread { +public: + ThinLtoDiscoveryThread(std::atomic &RunningFlag, + ThinLtoInstrumentationLayer &L, + ThinLtoModuleIndex &GlobalIndex, + unsigned LookaheadLevels, + ThinLtoJIT::AddModuleFunction AddModule) + : KeepRunning(RunningFlag), Layer(L), GlobalIndex(GlobalIndex), + AddModule(std::move(AddModule)), LookaheadLevels(LookaheadLevels) {} + + void operator()(); + +private: + std::atomic &KeepRunning; + ThinLtoInstrumentationLayer &Layer; + ThinLtoModuleIndex &GlobalIndex; + ThinLtoJIT::AddModuleFunction AddModule; + unsigned LookaheadLevels; + + 
std::set discoverCalleeModulePaths(FunctionSummary *S, + unsigned Levels); +}; + +} // namespace orc +} // namespace llvm + +#endif diff --git a/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp new file mode 100644 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp @@ -0,0 +1,104 @@ +#include "ThinLtoDiscoveryThread.h" + +#include "llvm/IR/GlobalValue.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" + +#include "ThinLtoInstrumentationLayer.h" +#include "ThinLtoModuleIndex.h" + +#define DEBUG_TYPE "thinltojit" + +namespace llvm { +namespace orc { + +void ThinLtoDiscoveryThread::operator()() { + while (KeepRunning.load()) { + std::vector Indexes = Layer.takeFlagsThatFired(); + + // TODO: Dispatch actual module loading into a thread pool. + if (!Indexes.empty()) { + LLVM_DEBUG(dbgs() << Indexes.size() << " new flags raised\n"); + auto ReachedFunctions = Layer.takeFlagOwners(std::move(Indexes)); + + // While traversing the call graph, collect the modules we cross. + std::set Paths; + for (GlobalValue::GUID F : ReachedFunctions) { + if (GlobalValueSummary *S = GlobalIndex.getSummary(F)) { + if (isa(S)) { + std::set NewPaths = discoverCalleeModulePaths( + cast(S), LookaheadLevels - 1); + Paths.insert(NewPaths.begin(), NewPaths.end()); + } else { + LLVM_DEBUG(dbgs() + << "Reached symbol is not a function: " << F << "\n"); + } + } + } + +#ifndef NDEBUG + unsigned Added = 0; +#endif + for (StringRef M : Paths) { + Expected> TSM = + GlobalIndex.parseNewModuleFromFile(M); + if (!TSM) { + // Failed to parse the module. + Layer.getExecutionSession().reportError(TSM.takeError()); + continue; + } + + if (!*TSM) + // This module was added already. + continue; + + if (Error LoadErr = AddModule(std::move(**TSM))) { + // Failed to load the module. 
+ Layer.getExecutionSession().reportError(std::move(LoadErr)); + } + +#ifndef NDEBUG + ++Added; +#endif + } + + LLVM_DEBUG(dbgs() << "DiscoveryThread: " << Added << " new modules " + << "(" << Paths.size() - Added << " known modules)\n"); + } + } +} + +// We don't filter visited functions here. Discovery will often be retriggered +// from the middle of already visited functions and aims to reach a little +// further each time. +std::set +ThinLtoDiscoveryThread::discoverCalleeModulePaths(FunctionSummary *S, + unsigned Levels) { + // Summaries for function callees must be FunctionSummaries. + auto getCalleeSummary = [](const ValueInfo &VI) -> FunctionSummary * { + const auto &SummaryList = VI.getSummaryList(); + if (SummaryList.empty()) + return nullptr; + return cast(SummaryList.front().get()->getBaseObject()); + }; + + // TODO: The paths we discover should be weighed, e.g. by number of edges that + // lead there and the minimal distance to go? + std::set Paths; + bool VisitChildren = (Levels > 0); + + for (const auto &Edge : S->calls()) { + if (FunctionSummary *CalleeSummary = getCalleeSummary(Edge.first)) { + Paths.insert(CalleeSummary->modulePath()); + if (VisitChildren) { + std::set ChildPaths = + discoverCalleeModulePaths(CalleeSummary, Levels - 1); + Paths.insert(ChildPaths.begin(), ChildPaths.end()); + } + } + } + return Paths; +} + +} // namespace orc +} // namespace llvm diff --git a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h new file mode 100644 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h @@ -0,0 +1,78 @@ +#ifndef LLVM_EXAMPLES_THINLTOJIT_DISCOVERYLAYER_H +#define LLVM_EXAMPLES_THINLTOJIT_DISCOVERYLAYER_H + +#include "llvm/ExecutionEngine/JITSymbol.h" +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" +#include "llvm/ExecutionEngine/Orc/Layer.h" +#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h" +#include 
"llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" + +#include +#include +#include +#include +#include + +namespace llvm { +namespace orc { + +class ThinLtoInstrumentationLayer : public IRLayer { +public: + enum ExplicitMemoryBarrier { + Never = 0, + StaticCode = 1, + JITedCode = 2, + Always = 3 + }; + + ThinLtoInstrumentationLayer(ExecutionSession &ES, IRCompileLayer &BaseLayer, + ExplicitMemoryBarrier InsertMemBarrier, + unsigned FlagsPerBatch) + : IRLayer(ES), BaseLayer(BaseLayer), InsertMemBarrier(InsertMemBarrier) { + // TODO: So far we only allocate one batch. + allocateDiscoveryFlags(FlagsPerBatch); + } + + ~ThinLtoInstrumentationLayer() override; + + void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override; + + unsigned reserveDiscoveryFlags(unsigned Count); + void registerDiscoveryFlagOwners(std::vector Guids, + unsigned FirstIdx); + + void nudgeIntoDiscovery(std::vector Functions); + + std::vector takeFlagsThatFired(); + std::vector takeFlagOwners(std::vector Indexes); + +private: + IRCompileLayer &BaseLayer; + ExplicitMemoryBarrier InsertMemBarrier; + + enum Flag : uint8_t { Clear = 0, Fired = 1 }; + + // Lock-free read access. + uint8_t *FlagsStorage; + Flag *FlagsIncoming; // lock-free write by design + Flag *FlagsHandled; + unsigned NumFlagsAllocated; + std::atomic NumFlagsUsed; // spin-lock + + // Acquire/release sync between writers and reader + std::atomic FlagsSync; + + // STL container requires locking for both, read and write access. 
+ mutable std::mutex DiscoveryFlagsInfoLock; + std::map FlagOwnersMap; + + void allocateDiscoveryFlags(unsigned MinFlags); + void compileFunctionReachedFlagSetter(IRBuilder<> &B, Flag *F); +}; + +} // namespace orc +} // namespace llvm + +#endif diff --git a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp new file mode 100644 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp @@ -0,0 +1,227 @@ +#include "ThinLtoInstrumentationLayer.h" + +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Process.h" + +#include + +#define DEBUG_TYPE "thinltojit" + +namespace llvm { +namespace orc { + +// TODO: Fixed set of flags may not always be enough. Make this expandable. +void ThinLtoInstrumentationLayer::allocateDiscoveryFlags(unsigned MinFlags) { + // Round up to full memory pages. + unsigned PageSize = sys::Process::getPageSizeEstimate(); + unsigned NumPagesEach = (MinFlags + (PageSize - 1)) / PageSize; + unsigned NumPagesTotal = 2 * NumPagesEach; + assert(isPowerOf2_64(PageSize) && "Adjust aligned memory alloc below"); + + // Allocate one more page to make up for size loss due to alignment. + void *Storage = std::calloc(NumPagesTotal + 1, PageSize); + uint64_t StorageAddr = reinterpret_cast(Storage); + uint64_t PageSizeDecr = PageSize - 1; + uint64_t AlignedAddr = ((StorageAddr + PageSizeDecr) & ~PageSizeDecr); + uint64_t Diff = AlignedAddr - StorageAddr; + + // For each flag we allocate one byte in each location: Incoming and Handled. 
+ // TODO: 'Handled' could be a bitset, but size must be dynamic + NumFlagsUsed.store(0); + NumFlagsAllocated = NumPagesEach * PageSize; + FlagsStorage = static_cast(Storage); + FlagsIncoming = reinterpret_cast(FlagsStorage + Diff); + FlagsHandled = FlagsIncoming + NumFlagsAllocated; + + static_assert(sizeof(FlagsIncoming[0]) == sizeof(uint8_t), "Flags are bytes"); + assert(reinterpret_cast(FlagsIncoming) % PageSize == 0); + assert(reinterpret_cast(FlagsHandled) % PageSize == 0); + assert(NumFlagsAllocated >= MinFlags); +} + +unsigned ThinLtoInstrumentationLayer::reserveDiscoveryFlags(unsigned Count) { + assert(Count > 0); + unsigned Before, After; + do { + Before = NumFlagsUsed.load(); + After = Before + Count; + } while (!NumFlagsUsed.compare_exchange_weak(Before, After)); + +#ifndef NDEBUG + for (unsigned i = Before; i < After; i++) { + assert(FlagsIncoming[i] == Clear); + } +#endif + + return Before; // First reserved index +} + +void ThinLtoInstrumentationLayer::registerDiscoveryFlagOwners( + std::vector Guids, unsigned FirstIdx) { + unsigned Count = Guids.size(); + + std::lock_guard Lock(DiscoveryFlagsInfoLock); + for (unsigned i = 0; i < Count; i++) { + assert(!FlagOwnersMap.count(FirstIdx + i) && + "Flag should not have an owner at this point"); + FlagOwnersMap[FirstIdx + i] = Guids[i]; + } +} + +std::vector ThinLtoInstrumentationLayer::takeFlagsThatFired() { + // This is only effective with the respective Release. 
+ FlagsSync.load(std::memory_order_acquire); + + std::vector Indexes; + unsigned NumIndexesUsed = NumFlagsUsed.load(); + for (unsigned i = 0; i < NumIndexesUsed; i++) { + if (FlagsIncoming[i] == Fired && FlagsHandled[i] == Clear) { + FlagsHandled[i] = Fired; + Indexes.push_back(i); + } + } + + return Indexes; +} + +std::vector +ThinLtoInstrumentationLayer::takeFlagOwners(std::vector Indexes) { + std::vector ReachedFunctions; + std::lock_guard Lock(DiscoveryFlagsInfoLock); + + for (unsigned i : Indexes) { + auto KV = FlagOwnersMap.find(i); + assert(KV != FlagOwnersMap.end()); + ReachedFunctions.push_back(KV->second); + FlagOwnersMap.erase(KV); + } + + return ReachedFunctions; +} + +void ThinLtoInstrumentationLayer::nudgeIntoDiscovery( + std::vector Functions) { + unsigned Count = Functions.size(); + + // Registering synthetic flags in advance. We expect them to get processed + // before the respective functions get emitted. If not, the emit() function + unsigned FirstFlagIdx = reserveDiscoveryFlags(Functions.size()); + registerDiscoveryFlagOwners(std::move(Functions), FirstFlagIdx); + + // Initialize the flags as fired and force a cache sync, so discovery will + // pick them up as soon as possible. + for (unsigned i = FirstFlagIdx; i < FirstFlagIdx + Count; i++) { + FlagsIncoming[i] = Fired; + } + if (InsertMemBarrier & ExplicitMemoryBarrier::StaticCode) { + FlagsSync.store(0, std::memory_order_release); + } + + LLVM_DEBUG(dbgs() << "Nudged " << Count << " new functions into discovery\n"); +} + +void ThinLtoInstrumentationLayer::emit(MaterializationResponsibility R, + ThreadSafeModule TSM) { + TSM.withModuleDo([this](Module &M) { + std::vector FunctionsToInstrument; + + // We may have discovered ahead of some functions already, but we still + // instrument them all. Their notifications steer the future direction of + // discovery. 
+ for (Function &F : M.getFunctionList()) + if (!F.isDeclaration()) + FunctionsToInstrument.push_back(&F); + + if (!FunctionsToInstrument.empty()) { + IRBuilder<> B(M.getContext()); + std::vector NewDiscoveryRoots; + + // Flags that fire must have owners registered. We will do it below and + // that's fine, because they can only be reached once the code is emitted. + unsigned FirstFlagIdx = + reserveDiscoveryFlags(FunctionsToInstrument.size()); + + unsigned NextFlagIdx = FirstFlagIdx; + for (Function *F : FunctionsToInstrument) { + + BasicBlock *E = &F->getEntryBlock(); + B.SetInsertPoint(BasicBlock::Create( + M.getContext(), "NotifyFunctionReachedProlog", F, E)); + compileFunctionReachedFlagSetter(B, FlagsIncoming + NextFlagIdx); + B.CreateBr(E); + + NewDiscoveryRoots.push_back(GlobalValue::getGUID(F->getName())); + ++NextFlagIdx; + } + + LLVM_DEBUG(dbgs() << "Instrumented " << NewDiscoveryRoots.size() + << " new functions in module " << M.getName() << "\n"); + + // Submit owner info, so the DiscoveryThread can evaluate the flags. + registerDiscoveryFlagOwners(std::move(NewDiscoveryRoots), FirstFlagIdx); + } + }); + + BaseLayer.emit(std::move(R), std::move(TSM)); +} + +void ThinLtoInstrumentationLayer::compileFunctionReachedFlagSetter( + IRBuilder<> &B, Flag *F) { + assert(*F == Clear); + Type *Int64Ty = Type::getInt64Ty(B.getContext()); + + // Write one immediate 8bit value to a fixed location in memory. + auto FlagAddr = pointerToJITTargetAddress(F); + Type *FlagTy = Type::getInt8Ty(B.getContext()); + B.CreateStore(ConstantInt::get(FlagTy, Fired), + B.CreateIntToPtr(ConstantInt::get(Int64Ty, FlagAddr), + FlagTy->getPointerTo())); + + if (InsertMemBarrier & ExplicitMemoryBarrier::JITedCode) { + // Overwrite the sync value with Release ordering. The discovery thread + // reads it with Acquire ordering. The actual value doesn't matter. 
+ static constexpr bool IsVolatile = true; + static constexpr Instruction *NoInsertBefore = nullptr; + auto SyncFlagAddr = pointerToJITTargetAddress(&FlagsSync); + + B.Insert( + new StoreInst(ConstantInt::get(Int64Ty, 0), + B.CreateIntToPtr(ConstantInt::get(Int64Ty, SyncFlagAddr), + Int64Ty->getPointerTo()), + IsVolatile, MaybeAlign(64), AtomicOrdering::Release, + SyncScope::System, NoInsertBefore)); + } +} + +ThinLtoInstrumentationLayer::~ThinLtoInstrumentationLayer() { + LLVM_DEBUG({ + dbgs() << "Discovery flags stats\n"; + + unsigned NumFlagsFired = 0; + for (unsigned i = 0; i < NumFlagsAllocated; i++) { + if (FlagsIncoming[i] == Fired) + ++NumFlagsFired; + } + dbgs() << "Alloc: " << format("%6.d", NumFlagsAllocated) << "\n"; + dbgs() << "Issued: " << format("%6.d", NumFlagsUsed.load()) << "\n"; + dbgs() << "Fired: " << format("%6.d", NumFlagsFired) << "\n"; + + unsigned RemainingFlagOwners = 0; + for (const auto &_ : FlagOwnersMap) { + ++RemainingFlagOwners; + (void)_; + } + dbgs() << "\nFlagOwnersMap has " << RemainingFlagOwners + << " remaining entries.\n"; + }); + + std::free(FlagsStorage); +} + +} // namespace orc +} // namespace llvm diff --git a/llvm/examples/ThinLtoJIT/ThinLtoJIT.h b/llvm/examples/ThinLtoJIT/ThinLtoJIT.h new file mode 100644 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoJIT.h @@ -0,0 +1,114 @@ +#ifndef LLVM_EXAMPLES_THINLTOJIT_THINLTOJIT_H +#define LLVM_EXAMPLES_THINLTOJIT_THINLTOJIT_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" +#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" +#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ThreadPool.h" + +#include "ThinLtoInstrumentationLayer.h" + +#include +#include +#include +#include + +namespace llvm { +namespace orc { + +class ThinLtoModuleIndex; +class 
ThinLtoDiscoveryThread; + +class RTDyldObjectLinkingLayer; +class IRCompileLayer; +class CompileOnDemandLayer; + +class JITDylib; +class MangleAndInterner; +class LazyCallThroughManager; + +class ThinLtoJIT { +public: + using AddModuleFunction = std::function; + + ThinLtoJIT(ArrayRef ModuleFiles, StringRef MainFunctionName, + unsigned LookaheadLevels, unsigned NumCompileThreads, + unsigned DiscoveryFlagsPerBatch, + ThinLtoInstrumentationLayer::ExplicitMemoryBarrier MemFence, + bool LookupOnAdd, bool AllowNudgeIntoDiscovery, Error &Err); + ~ThinLtoJIT(); + + ThinLtoJIT(const ThinLtoJIT &) = delete; + ThinLtoJIT &operator=(const ThinLtoJIT &) = delete; + ThinLtoJIT(ThinLtoJIT &&) = delete; + ThinLtoJIT &operator=(ThinLtoJIT &&) = delete; + + Expected main(ArrayRef Args) { + auto MainSym = ES.lookup({MainJD}, MainFunctionMangled); + if (!MainSym) + return MainSym.takeError(); + + using MainFn = int(int, char *[]); + auto Main = jitTargetAddressToFunction(MainSym->getAddress()); + + return runAsMain(Main, Args, StringRef("ThinLtoJIT")); + } + +private: + ExecutionSession ES; + std::unique_ptr DL; + + // Local convenience class to allow late construction of the mangler while + // preserving the conventional Mangle(SymbolName) syntax. 
+ struct MangleWrapper { + SymbolStringPtr operator()(StringRef S) { return Impl->operator()(S); } + char getGlobalPrefix() { return DL->getGlobalPrefix(); } + void init(ExecutionSession &ES, Module *M) { + DL = std::make_unique(M); + Impl = std::make_unique(ES, *DL); + } + std::unique_ptr Impl{nullptr}; + std::unique_ptr DL{nullptr}; + }; + + MangleWrapper Mangle; + + JITDylib *MainJD; + SymbolStringPtr MainFunctionMangled; + std::unique_ptr CompileThreads; + std::unique_ptr GlobalIndex; + + AddModuleFunction AddModule; + AddModuleFunction AddModuleAndLookup; + std::unique_ptr ObjLinkingLayer; + std::unique_ptr CompileLayer; + std::unique_ptr InstrumentationLayer; + std::unique_ptr OnDemandLayer; + + std::atomic JitRunning; + std::unique_ptr DiscoveryThreadWorker; + std::unique_ptr CallThroughManager; + + Error + setupLayers(Triple TT, unsigned DiscoveryFlagsPerBatch, + ThinLtoInstrumentationLayer::ExplicitMemoryBarrier MemFence); + Error setupJITDylib(JITDylib *JD, bool AllowNudge); + Error setupDiscovery(unsigned NumCompileThreads, unsigned LookaheadLevels, + bool LookupOnAdd); + Expected setupMainModule(StringRef MainFunction); + + static void exitOnLazyCallThroughFailure() { + errs() << "Compilation failed. 
Aborting.\n"; + exit(1); + } +}; + +} // namespace orc +} // namespace llvm + +#endif diff --git a/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp b/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp new file mode 100644 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp @@ -0,0 +1,317 @@ +#include "ThinLtoJIT.h" + +#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" +#include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" +#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" +#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" +#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" +#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include "llvm/Support/Debug.h" + +#include "ThinLtoDiscoveryThread.h" +#include "ThinLtoInstrumentationLayer.h" +#include "ThinLtoModuleIndex.h" + +#include +#include +#include + +#ifndef NDEBUG +#include +#endif + +#define DEBUG_TYPE "thinltojit" + +namespace llvm { +namespace orc { + +class ThinLtoDefinitionGenerator : public JITDylib::DefinitionGenerator { +public: + ThinLtoDefinitionGenerator(ThinLtoModuleIndex &GlobalIndex, + ThinLtoInstrumentationLayer &InstrumentationLayer, + ThinLtoJIT::AddModuleFunction AddModule, + char ManglePrefix, bool AllowNudge) + : GlobalIndex(GlobalIndex), InstrumentationLayer(InstrumentationLayer), + AddModule(std::move(AddModule)), ManglePrefix(ManglePrefix), + AllowNudgeIntoDiscovery(AllowNudge) {} + + Error tryToGenerate(LookupKind K, JITDylib &JD, + JITDylibLookupFlags JDLookupFlags, + const SymbolLookupSet &Symbols) override; + +private: + ThinLtoModuleIndex &GlobalIndex; + ThinLtoInstrumentationLayer &InstrumentationLayer; + ThinLtoJIT::AddModuleFunction AddModule; + char ManglePrefix; + bool AllowNudgeIntoDiscovery; + + // ThinLTO summaries encode unprefixed names. 
+ StringRef stripGlobalManglePrefix(StringRef Symbol) const { + bool Strip = (ManglePrefix != '\0' && Symbol[0] == ManglePrefix); + return Strip ? StringRef(Symbol.data() + 1, Symbol.size() - 1) : Symbol; + } +}; + +Error ThinLtoDefinitionGenerator::tryToGenerate( + LookupKind K, JITDylib &JD, JITDylibLookupFlags JDLookupFlags, + const SymbolLookupSet &Symbols) { +#ifndef NDEBUG + unsigned Added = 0; +#endif + + std::vector NewDiscoveryRoots; + StringMap> SymbolNamesByModulePath; + + for (const auto &KV : Symbols) { + StringRef Name = stripGlobalManglePrefix(*KV.first); + auto Guid = GlobalValue::getGUID(Name); + + if (GlobalValueSummary *S = GlobalIndex.getSummary(Guid)) { + std::vector &Names = SymbolNamesByModulePath[S->modulePath()]; + Names.push_back(Name); + if (AllowNudgeIntoDiscovery && isa(S)) { + NewDiscoveryRoots.push_back(GlobalValue::getGUID(Name)); + } + } + } + + for (const auto &KV : SymbolNamesByModulePath) { + Expected> TSM = + GlobalIndex.parseNewModuleFromFile(KV.first()); + if (!TSM) { + // Parsing the module from disk failed, after we successfully obtained + // ValueInfos for its symbols from ThinLTO summaries. + return TSM.takeError(); + } + + // We did parse the module already, but the add request is waiting "outside" + // for the lock that will be freed once this request is done. There appears + // to be no way to temporarily suspend the request and get the symbol ready. + // Thus, we must parse the module here again and submit it before returning. + // It's quite expensive as we are actively blocking execution at this point. + if (!*TSM) { + InstrumentationLayer.getExecutionSession().reportError( + createStringError(inconvertibleErrorCode(), + "Module %s required for symbol %s was added while " + "request for it was in progress. 
Reparsing!", + KV.first().data(), KV.second.front().data())); + + static constexpr bool ForceLoad = true; + TSM = GlobalIndex.parseNewModuleFromFile(KV.first(), ForceLoad); + if (!TSM) { + // Parsing the module from disk failed. This may happen any time. + return TSM.takeError(); + } + assert(*TSM && "We forced the load operation"); + } + + if (Error LoadErr = AddModule(std::move(**TSM))) + // Found a module but failed to add it. + return LoadErr; + +#ifndef NDEBUG + ++Added; +#endif + } + + LLVM_DEBUG(dbgs() << "Generator: " << Added + << " new modules added synchronously\n"); + + // We can anticipate to run into the requested functions as soon as execution + // continues. Thus, we may trigger discovery flags for them already now to + // initiate discovery behind them. We will probably compile a few unnecessary + // things in this case. + if (!NewDiscoveryRoots.empty() && AllowNudgeIntoDiscovery) { + // The registration involves locking a mutex, so better do it in a + // separate thread. + std::thread( + [this](std::vector Rs) { + InstrumentationLayer.nudgeIntoDiscovery(std::move(Rs)); + }, + std::move(NewDiscoveryRoots)) + .detach(); + } + + return Error::success(); +} + +ThinLtoJIT::ThinLtoJIT( + ArrayRef ModuleFiles, StringRef MainFunctionName, + unsigned LookaheadLevels, unsigned NumCompileThreads, + unsigned DiscoveryFlagsPerBatch, + ThinLtoInstrumentationLayer::ExplicitMemoryBarrier MemFence, + bool AllowNudgeIntoDiscovery, bool LookupOnAdd, Error &Err) { + ErrorAsOutParameter ErrAsOutParam(&Err); + + GlobalIndex = std::make_unique(ES); + for (StringRef F : ModuleFiles) { + if (auto Err = GlobalIndex->add(F)) + ES.reportError(std::move(Err)); + } + + auto TSM = setupMainModule(MainFunctionName); + if (!TSM) { + Err = TSM.takeError(); + return; + } + + ThreadSafeModule MainModule = std::move(*TSM); + Module *RawModule = MainModule.getModuleUnlocked(); + + // Now that we know the target data layout we can setup the mangler. 
+ Mangle.init(ES, RawModule); + MainFunctionMangled = Mangle(MainFunctionName); + + Err = setupLayers(Triple(RawModule->getTargetTriple()), + DiscoveryFlagsPerBatch, MemFence); + if (Err) + return; + + Err = setupDiscovery(NumCompileThreads, LookaheadLevels, LookupOnAdd); + if (Err) + return; + + MainJD = &ES.createJITDylib("main"); + Err = setupJITDylib(MainJD, AllowNudgeIntoDiscovery); + if (Err) + return; + + Err = AddModule(std::move(MainModule)); + if (Err) + return; + + if (AllowNudgeIntoDiscovery) { + auto MainFunctionGuid = GlobalValue::getGUID(MainFunctionName); + InstrumentationLayer->nudgeIntoDiscovery({MainFunctionGuid}); + } + +#ifndef NDEBUG + // Uncomment to give the discovery thread some time do things. + // std::this_thread::sleep_for(std::chrono::milliseconds(1000)); +#endif +} + +Expected ThinLtoJIT::setupMainModule(StringRef MainFunction) { + Optional M = GlobalIndex->getModulePathForSymbol(MainFunction); + if (!M) { + std::string Buffer; + raw_string_ostream OS(Buffer); + OS << "No ValueInfo for symbol '" << MainFunction; + OS << "' in provided modules: "; + for (StringRef P : GlobalIndex->getAllModulePaths()) + OS << P << " "; + OS << "\n"; + return createStringError(inconvertibleErrorCode(), OS.str()); + } + Expected> TSM = + GlobalIndex->parseNewModuleFromFile(*M); + if (!TSM) + return TSM.takeError(); + assert(*TSM && "This is the first module, it cannot exist yet"); + return std::move(**TSM); +} + +Error ThinLtoJIT::setupLayers( + Triple TT, unsigned DiscoveryFlagsPerBatch, + ThinLtoInstrumentationLayer::ExplicitMemoryBarrier MemFence) { + ObjLinkingLayer = std::make_unique( + ES, []() { return std::make_unique(); }); + CompileLayer = std::make_unique( + ES, *ObjLinkingLayer, ConcurrentIRCompiler(JITTargetMachineBuilder(TT))); + + InstrumentationLayer = std::make_unique( + ES, *CompileLayer, MemFence, DiscoveryFlagsPerBatch); + + auto ISMB = createLocalIndirectStubsManagerBuilder(TT); + auto LCTM = createLocalLazyCallThroughManager( + TT, 
ES, pointerToJITTargetAddress(exitOnLazyCallThroughFailure)); + if (!LCTM) + return LCTM.takeError(); + + CallThroughManager = std::move(*LCTM); + OnDemandLayer = std::make_unique( + ES, *InstrumentationLayer, *CallThroughManager, std::move(ISMB)); + // Don't break up modules. Insert stubs on module boundaries. + OnDemandLayer->setPartitionFunction(CompileOnDemandLayer::compileWholeModule); + + AddModule = [this](ThreadSafeModule TSM) -> Error { + assert(MainJD && "Setup MainJD JITDylib before calling"); + return OnDemandLayer->add(*MainJD, std::move(TSM)); + }; + + return Error::success(); +} + +Error ThinLtoJIT::setupDiscovery(unsigned NumCompileThreads, + unsigned LookaheadLevels, bool LookupOnAdd) { + // Delegate compilation to the thread pool. + CompileThreads = std::make_unique(NumCompileThreads); + ES.setDispatchMaterialization( + [this](JITDylib &JD, std::shared_ptr MU) { + CompileThreads->async([MU, &JD]() { MU->doMaterialize(JD); }); + }); + +#ifndef NDEBUG + // Uncomment to avoid discovering all at once when debugging small examples. + // LookaheadLevels = 1; +#endif + + // We can lookup one symbol right away to force immediately materialization. + auto AddModuleAndLookup = [this](ThreadSafeModule TSM) -> Error { + std::string FunctionNameForLookup = + TSM.getModuleUnlocked()->getFunctionList().front().getName().str(); + Error SubmitErr = AddModule(std::move(TSM)); + if (SubmitErr) + return SubmitErr; + // TODO: This is quite workaroundish. Could the CompileOnDemandLayer have a + // flag to force materialization without an extra lookup like this? + auto LookupRes = ES.lookup({MainJD}, Mangle(FunctionNameForLookup)); + if (!LookupRes) + return LookupRes.takeError(); + assert(LookupRes->getAddress() && "Function should be emitted now"); + return Error::success(); + }; + + // Spawn discovery thread and let it add newly discovered modules to the JIT. + JitRunning.store(true); + AddModuleFunction F = LookupOnAdd ? 
AddModuleAndLookup : AddModule; + DiscoveryThreadWorker = std::make_unique( + JitRunning, *InstrumentationLayer, *GlobalIndex, LookaheadLevels, + std::move(F)); + + std::thread(std::ref(*DiscoveryThreadWorker)).detach(); + return Error::success(); +} + +Error ThinLtoJIT::setupJITDylib(JITDylib *JD, bool AllowNudge) { + // Register symbols for C++ static destructors. + LocalCXXRuntimeOverrides CXXRuntimeoverrides; + Error Err = CXXRuntimeoverrides.enable(*JD, *Mangle.Impl); + if (Err) + return Err; + + // Lookup symbol names in the global ThinLTO module index first + char Prefix = Mangle.getGlobalPrefix(); + JD->addGenerator(std::make_unique( + *GlobalIndex, *InstrumentationLayer, AddModule, Prefix, AllowNudge)); + // Then try lookup in the host process. + auto HostLookup = DynamicLibrarySearchGenerator::GetForCurrentProcess(Prefix); + if (!HostLookup) + return HostLookup.takeError(); + JD->addGenerator(std::move(*HostLookup)); + + return Error::success(); +} + +ThinLtoJIT::~ThinLtoJIT() { + // Signal the DiscoveryThread to shut down. + JitRunning.store(false); + // Wait for potential compile actions to finish. 
+ CompileThreads->wait(); +} + +} // namespace orc +} // namespace llvm diff --git a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h new file mode 100644 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h @@ -0,0 +1,47 @@ +#ifndef LLVM_EXAMPLES_THINLTOJIT_THINLTOJITMODULEINDEX_H +#define LLVM_EXAMPLES_THINLTOJIT_THINLTOJITMODULEINDEX_H + +#include "llvm/ADT/Optional.h" +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/ModuleSummaryIndex.h" +#include "llvm/Support/Error.h" + +#include +#include +#include +#include + +namespace llvm { +namespace orc { + +class SymbolStringPtr; + +class ThinLtoModuleIndex { + static constexpr bool HaveGVs = false; + +public: + ThinLtoModuleIndex(ExecutionSession &ES) + : ES(ES), CombinedSummaryIndex(HaveGVs), NextModuleId(0) {} + + Error add(StringRef ModulePath); + GlobalValueSummary *getSummary(GlobalValue::GUID Function) const; + std::vector getAllModulePaths() const; + Optional getModulePathForSymbol(StringRef Name) const; + Expected> + parseNewModuleFromFile(StringRef Path, bool ForceLoad = false); + +private: + ExecutionSession &ES; + ModuleSummaryIndex CombinedSummaryIndex; + uint64_t NextModuleId; + + std::mutex ParsedModulesLock; + std::set ParsedModules; +}; + +} // namespace orc +} // namespace llvm + +#endif diff --git a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp new file mode 100644 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp @@ -0,0 +1,94 @@ +#include "ThinLtoModuleIndex.h" + +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include + +#define DEBUG_TYPE "thinltojit" + 
+namespace llvm { +namespace orc { + +Error ThinLtoModuleIndex::add(StringRef ModulePath) { + auto Buffer = errorOrToExpected(MemoryBuffer::getFile(ModulePath)); + if (!Buffer) + return Buffer.takeError(); + + Error ParseErr = readModuleSummaryIndex((*Buffer)->getMemBufferRef(), + CombinedSummaryIndex, NextModuleId); + if (ParseErr) + return ParseErr; + + ++NextModuleId; + return Error::success(); +} + +std::vector ThinLtoModuleIndex::getAllModulePaths() const { + std::vector Paths; + for (const auto &KV : CombinedSummaryIndex.modulePaths()) { + StringRef Path = KV.first(); + Paths.push_back(Path); + } + return Paths; +} + +GlobalValueSummary * +ThinLtoModuleIndex::getSummary(GlobalValue::GUID Function) const { + ValueInfo VI = CombinedSummaryIndex.getValueInfo(Function); + if (!VI || VI.getSummaryList().empty()) + return nullptr; +#ifndef NDEBUG + if (VI.getSummaryList().size() > 1) { + LLVM_DEBUG(dbgs() << "SummaryList with multiple entries!\n"); + } +#endif + return VI.getSummaryList().front().get()->getBaseObject(); +} + +Optional +ThinLtoModuleIndex::getModulePathForSymbol(StringRef Name) const { + if (GlobalValueSummary *S = getSummary(GlobalValue::getGUID(Name))) + return S->modulePath(); + return None; // We don't know the symbol. +} + +Expected> +ThinLtoModuleIndex::parseNewModuleFromFile(StringRef Path, bool ForceLoad) { + if (!ForceLoad) { + std::lock_guard Lock(ParsedModulesLock); + + SymbolStringPtr PathId = ES.intern(Path); + auto It = ParsedModules.find(PathId); + if (It != ParsedModules.end() && !ForceLoad) + // This is not a new module. 
+ return None; + + ParsedModules.insert(PathId); + } + + // TODO: make a SMDiagnosticError class for this + SMDiagnostic Err; + auto Ctx = std::make_unique(); + auto M = parseIRFile(Path, Err, *Ctx); + if (!M) { + std::string ErrDescription; + { + raw_string_ostream S(ErrDescription); + Err.print("ThinLtoJIT", S); + } + return createStringError(inconvertibleErrorCode(), + "Failed load module from file '%s' (%s)", + Path.data(), ErrDescription.c_str()); + } + + return ThreadSafeModule(std::move(M), std::move(Ctx)); +} + +} // namespace orc +} // namespace llvm diff --git a/llvm/examples/ThinLtoJIT/bench b/llvm/examples/ThinLtoJIT/bench new file mode 100755 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/bench @@ -0,0 +1,89 @@ +#!/bin/bash +#set -x + +if [ $# -gt 2 ]; then + TOOLS_DIR="$1" + SOURCE_DIR="$2" + MAIN_SOURCE_FILE="$3" +else + echo "Usage: bench
[]" + exit 1 +fi + +if [ $# -gt 3 ]; then + SYS_ROOT="$4" +else + SYS_ROOT="/" +fi + +function check_tool () +{ + if [ -e "${TOOLS_DIR}/$1" ]; then + echo "Found: $1" + else + echo "!!! Cannot find required tool, please provide it in the LLVM binaries folder: $1" + fi +} + +check_tool lli +check_tool SpeculativeJIT +check_tool ThinLtoJIT + +SKIP_BITCODE_GEN=0 +if [[ -e bc-default || -e bc-thinlto || -e ll-default || -e ll-thinlto ]]; then + echo "Skipping bitcode generation: output directories existing" + echo "Please clean up manually: rm -R bc-default bc-thinlto ll-default ll-thinlto" + SKIP_BITCODE_GEN=1 +else + check_tool clang + check_tool llvm-dis + mkdir bc-default + mkdir bc-thinlto + mkdir ll-default + mkdir ll-thinlto +fi + +ROOT_DIR=$(pwd) +ALL_BITCODE_FILES="" + +MAIN_FILE_BASENAME=$(basename "${MAIN_SOURCE_FILE%.c*}") +LLI_EXTRA_MODULES="" + +for f in ${SOURCE_DIR}/*.c* ; do + BASE_NAME=$(basename "${f%.c*}") + + if [ ${SKIP_BITCODE_GEN} -eq 0 ]; then + echo "Compile: $f -> ${BASE_NAME}.bc" + + "${TOOLS_DIR}/clang" -c -I ${SOURCE_DIR} -isysroot ${SYS_ROOT} -emit-llvm \ + -o "bc-default/${BASE_NAME}.bc" "$f" + "${TOOLS_DIR}/clang" -c -I ${SOURCE_DIR} -isysroot ${SYS_ROOT} -flto=thin \ + -o "bc-thinlto/${BASE_NAME}.bc" "$f" + + echo "Disassemble ${BASE_NAME}.bc -> ${BASE_NAME}.ll" + ${TOOLS_DIR}/llvm-dis bc-default/${BASE_NAME}.bc -o ll-default/${BASE_NAME}.ll + ${TOOLS_DIR}/llvm-dis bc-thinlto/${BASE_NAME}.bc -o ll-thinlto/${BASE_NAME}.ll + fi + + ALL_BITCODE_FILES="${ALL_BITCODE_FILES} ${BASE_NAME}.bc" + if [ "${BASE_NAME}" != "${MAIN_FILE_BASENAME}" ]; then + LLI_EXTRA_MODULES="${LLI_EXTRA_MODULES} -extra-module=${BASE_NAME}.bc" + fi +done + +set -x +cd ${ROOT_DIR}/bc-default +time (${TOOLS_DIR}/clang -o ${MAIN_FILE_BASENAME} -O0 ${ALL_BITCODE_FILES} && ./${MAIN_FILE_BASENAME} 1>/dev/null) +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=mcjit "${MAIN_FILE_BASENAME}.bc" 1>/dev/null +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-mcjit 
"${MAIN_FILE_BASENAME}.bc" 1>/dev/null +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy "${MAIN_FILE_BASENAME}.bc" 1>/dev/null +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -compile-threads=2 "${MAIN_FILE_BASENAME}.bc" 1>/dev/null +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy "${MAIN_FILE_BASENAME}.bc" 1>/dev/null +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy -compile-threads=2 "${MAIN_FILE_BASENAME}.bc" 1>/dev/null +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy -compile-threads=2 -O1 "${MAIN_FILE_BASENAME}.bc" 1>/dev/null +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy -compile-threads=2 -O0 "${MAIN_FILE_BASENAME}.bc" 1>/dev/null +time ${TOOLS_DIR}/SpeculativeJIT -num-threads=2 ${ALL_BITCODE_FILES} 1>/dev/null + +cd ${ROOT_DIR}/bc-thinlto +#time (${TOOLS_DIR}/clang -flto=thin -o test ${ALL_BITCODE_FILES} && ./test 1>/dev/null) +time ${TOOLS_DIR}/ThinLtoJIT -compile-threads=2 ${ALL_BITCODE_FILES} 1>/dev/null diff --git a/llvm/examples/ThinLtoJIT/main.cpp b/llvm/examples/ThinLtoJIT/main.cpp new file mode 100644 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/main.cpp @@ -0,0 +1,85 @@ +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/TargetSelect.h" + +#include "ThinLtoInstrumentationLayer.h" +#include "ThinLtoJIT.h" + +#include +#include + +using namespace llvm; + +static cl::list InputFiles(cl::Positional, cl::OneOrMore, + cl::desc("")); + +static cl::list InputArgs("args", cl::Positional, + cl::desc("..."), + cl::ZeroOrMore, cl::PositionalEatsArgs); + +static cl::opt CompileThreads("compile-threads", cl::Optional, + cl::desc("Number of compile threads"), + cl::init(4)); + +static cl::opt + LookaheadLevels("lookahead", cl::Optional, + cl::desc("Number of calls to look ahead of execution"), + 
cl::init(4)); + +static cl::opt DiscoveryFlagsBatchSize( + "discovery-flag-batch-size", cl::Optional, + cl::desc("Number of discovery flags allocated in one go"), cl::init(4096)); + +static cl::opt + MemFence( + "mem-fence", + cl::desc( + "Choose where to install memory fences for cache synchronization"), + cl::init(orc::ThinLtoInstrumentationLayer::Always), + cl::values(clEnumValN(orc::ThinLtoInstrumentationLayer::Never, "never", + "No use of memory fences"), + clEnumValN(orc::ThinLtoInstrumentationLayer::StaticCode, + "static", + "Use of memory fences in static code only"), + clEnumValN(orc::ThinLtoInstrumentationLayer::JITedCode, + "jited", + "Install memory fences in JITed code only"), + clEnumValN(orc::ThinLtoInstrumentationLayer::Always, + "always", "Always use of memory fences"))); + +static cl::opt + AllowNudge("allow-nudge", + cl::desc("Allow the symbol generator to nudge symbols into " + "discovery although they haven't been reached"), + cl::init(true)); + +static cl::opt + LookupOnAdd("lookup-on-add", + cl::desc("Issue an artificial lookup to force immediate " + "materialization of submitted modules"), + cl::init(true)); + +int main(int argc, char *argv[]) { /* Entry point: initializes LLVM and the native target, parses the command line, constructs the ThinLtoJIT from the option values and returns the result of running Jit.main with InputArgs. */ + InitLLVM X(argc, argv); + InitializeNativeTarget(); + InitializeNativeTargetAsmPrinter(); + cl::ParseCommandLineOptions(argc, argv, "ThinLtoJIT"); + + Error Err = Error::success(); /* Handed to the ThinLtoJIT constructor below and checked right after it. */ + auto atLeastOne = [](unsigned N) { return std::max(1u, N); }; /* Raises a value of 0 to 1; applied to CompileThreads. */ + + orc::ThinLtoJIT Jit(InputFiles, "main", LookaheadLevels, + atLeastOne(CompileThreads), DiscoveryFlagsBatchSize, + MemFence, LookupOnAdd, AllowNudge, Err); + if (Err) { /* Construction failed: log the error to errs() and exit with status 1. */ + logAllUnhandledErrors(std::move(Err), errs(), "ThinLtoJIT: "); + exit(1); + } + + ExitOnError ExitOnErr; + ExitOnErr.setBanner("ThinLtoJIT: "); + + return ExitOnErr(Jit.main(InputArgs)); /* The value unwrapped by ExitOnErr becomes the process exit code. */ +}