diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -161,6 +161,8 @@ "invalid Xarch argument: '%0', options requiring arguments are unsupported">; def err_drv_Xopenmp_target_missing_triple : Error< "cannot deduce implicit triple value for -Xopenmp-target, specify triple using -Xopenmp-target=">; +def err_drv_openmp_jit_without_lto : Error< + "cannot enable OpenMP offloading JIT, specify bitcode compilation with '-foffload-lto'">; def err_drv_invalid_Xopenmp_target_with_args : Error< "invalid -Xopenmp-target argument: '%0', options requiring arguments are unsupported">; def err_drv_argument_only_allowed_with : Error< diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2460,6 +2460,10 @@ Group, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume-threads-oversubscription">, Group, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; +def fopenmp_target_jit : Flag<["-"], "fopenmp-target-jit">, Group, + HelpText<"Enable JIT compilation for OpenMP Offloading">, Flags<[ NoArgumentUnused]>; +def fno_openmp_target_jit : Flag<["-"], "fno-openmp-target-jit">, Group, + Flags<[NoArgumentUnused, HelpHidden]>; defm openmp_target_new_runtime: BoolFOption<"openmp-target-new-runtime", LangOpts<"OpenMPTargetNewRuntime">, DefaultTrue, PosFlag, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -8133,6 +8133,12 @@ const char *LinkingOutput) const { ArgStringList CmdArgs; + if (!C.getDriver().isUsingLTO(/* IsOffload */ true) && + Args.hasFlag(options::OPT_fopenmp_target_jit, 
options::OPT_fno_openmp_target_jit, /*Default*/ false)) { + C.getDriver().Diag(clang::diag::err_drv_openmp_jit_without_lto); + } + if (getToolChain().getDriver().isUsingLTO(/* IsOffload */ true)) { // Pass in target features for each toolchain. auto OpenMPTCRange = C.getOffloadToolChains(); @@ -8192,6 +8198,11 @@ if (!OOpt.empty()) CmdArgs.push_back(Args.MakeArgString(Twine("-opt-level=O") + OOpt)); } + + if (Args.hasFlag(options::OPT_fopenmp_target_jit, + options::OPT_fno_openmp_target_jit, + /*Default=*/false)) + CmdArgs.push_back(Args.MakeArgString("-target-embed-bc")); } // Construct the link job so we can wrap around it. @@ -8200,6 +8211,7 @@ CmdArgs.push_back("-linker-path"); CmdArgs.push_back(LinkCommand->getExecutable()); + CmdArgs.push_back("--"); for (const char *LinkArg : LinkCommand->getArguments()) CmdArgs.push_back(LinkArg); diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -76,12 +76,18 @@ cl::desc("Path for the target bitcode library"), cl::cat(ClangLinkerWrapperCategory)); +static cl::opt EmbedBC( + "target-embed-bc", cl::ZeroOrMore, + cl::desc("Embed linked bitcode instead of an executable device image."), + cl::init(false), cl::cat(ClangLinkerWrapperCategory)); + // Do not parse linker options. static cl::list - HostLinkerArgs(cl::Sink, cl::desc("...")); + HostLinkerArgs(cl::Positional, + cl::desc("...")); /// Path of the current binary. -static std::string LinkerExecutable; +static const char *LinkerExecutable; /// Temporary files created by the linker wrapper. 
static SmallVector TempFiles; @@ -422,8 +428,8 @@ std::unique_ptr Buffer = MemoryBuffer::getMemBuffer(Library.getMemoryBufferRef(), false); - if (Error Err = writeArchive(TempFile, Members, true, Library.kind(), - true, Library.isThin(), std::move(Buffer))) + if (Error Err = writeArchive(TempFile, Members, true, Library.kind(), true, + Library.isThin(), std::move(Buffer))) return std::move(Err); return static_cast(TempFile); @@ -500,7 +506,7 @@ return static_cast(TempFile); } -Expected link(ArrayRef InputFiles, +Expected link(ArrayRef InputFiles, ArrayRef LinkerArgs, Triple TheTriple, StringRef Arch) { // NVPTX uses the nvlink binary to link device object files. @@ -534,7 +540,7 @@ CmdArgs.push_back(Arg); // Add extracted input files. - for (auto Input : InputFiles) + for (StringRef Input : InputFiles) CmdArgs.push_back(Input); if (sys::ExecuteAndWait(*NvlinkPath, CmdArgs)) @@ -544,7 +550,7 @@ } } // namespace nvptx -Expected linkDevice(ArrayRef InputFiles, +Expected linkDevice(ArrayRef InputFiles, ArrayRef LinkerArgs, Triple TheTriple, StringRef Arch) { switch (TheTriple.getArch()) { @@ -611,8 +617,10 @@ llvm_unreachable("Invalid optimization level"); } -std::unique_ptr createLTO(const Triple &TheTriple, StringRef Arch, - bool WholeProgram) { +template > +std::unique_ptr createLTO( + const Triple &TheTriple, StringRef Arch, bool WholeProgram, + ModuleHook Hook = [](size_t, const Module &) { return true; }) { lto::Config Conf; lto::ThinBackend Backend; // TODO: Handle index-only thin-LTO @@ -631,7 +639,7 @@ Conf.PTO.LoopVectorization = Conf.OptLevel > 1; Conf.PTO.SLPVectorization = Conf.OptLevel > 1; - // TODO: Handle outputting bitcode using a module hook. 
+ Conf.PostInternalizeModuleHook = Hook; if (TheTriple.isNVPTX()) Conf.CGFileType = CGFT_AssemblyFile; else @@ -651,11 +659,11 @@ [](char C) { return C == '_' || isAlnum(C); }); } -Expected> linkBitcodeFiles(ArrayRef InputFiles, - const Triple &TheTriple, - StringRef Arch) { +Error linkBitcodeFiles(SmallVectorImpl &InputFiles, + const Triple &TheTriple, StringRef Arch) { SmallVector, 4> SavedBuffers; SmallVector, 4> BitcodeFiles; + SmallVector NewInputFiles; StringMap UsedInRegularObj; // Search for bitcode files in the input and create an LTO input file. If it @@ -674,6 +682,7 @@ if (!ObjFile) return ObjFile.takeError(); + NewInputFiles.push_back(File.str()); for (auto &Sym : (*ObjFile)->symbols()) { Expected Name = Sym.getName(); if (!Name) @@ -693,12 +702,36 @@ } if (BitcodeFiles.empty()) - return None; + return Error::success(); + + auto HandleError = [&](std::error_code EC) { + logAllUnhandledErrors(errorCodeToError(EC), + WithColor::error(errs(), LinkerExecutable)); + exit(1); + }; + + // LTO Module hook to output bitcode without running the backend. + auto LinkOnly = [&](size_t Task, const Module &M) { + SmallString<128> TempFile; + if (std::error_code EC = sys::fs::createTemporaryFile( + "jit-" + TheTriple.getTriple(), "bc", TempFile)) + HandleError(EC); + std::error_code EC; + raw_fd_ostream LinkedBitcode(TempFile, EC, sys::fs::OF_None); + if (EC) + HandleError(EC); + WriteBitcodeToFile(M, LinkedBitcode); + TempFiles.push_back(static_cast(TempFile)); + NewInputFiles.push_back(static_cast(TempFile)); + return false; + }; // We have visibility of the whole program if every input is bitcode, all // inputs are statically linked so there should be no external references. bool WholeProgram = BitcodeFiles.size() == InputFiles.size(); - StringMap PrevailingSymbols; + auto LTOBackend = (EmbedBC) + ? createLTO(TheTriple, Arch, WholeProgram, LinkOnly) + : createLTO(TheTriple, Arch, WholeProgram); // TODO: Run more tests to verify that this is correct. 
// Create the LTO instance with the necessary config and add the bitcode files @@ -708,7 +741,7 @@ // 2. We do not support relocatable object files. // 3. All inputs are relocatable object files extracted from host binaries, so // there is no resolution to a dynamic library. - auto LTOBackend = createLTO(TheTriple, Arch, WholeProgram); + StringMap PrevailingSymbols; for (auto &BitcodeFile : BitcodeFiles) { const auto Symbols = BitcodeFile->symbols(); SmallVector Resolutions(Symbols.size()); @@ -757,16 +790,18 @@ StringRef Extension = (TheTriple.isNVPTX()) ? "s" : "o"; if (std::error_code EC = sys::fs::createTemporaryFile( "lto-" + TheTriple.getTriple(), Extension, FD, TempFile)) - return nullptr; + HandleError(EC); TempFiles.push_back(static_cast(TempFile)); return std::make_unique( std::make_unique(FD, true)); }; + if (Error Err = LTOBackend->run(AddStream)) return std::move(Err); + // Is we are compiling for NVPTX we need to run the assembler first. for (auto &File : Files) { - if (!TheTriple.isNVPTX()) + if (!TheTriple.isNVPTX() || EmbedBC) continue; auto FileOrErr = nvptx::assemble(File, TheTriple, Arch); @@ -775,7 +810,12 @@ File = *FileOrErr; } - return static_cast(Files.front()); + // Append the new inputs to the device linker input. + for (auto &File : Files) + NewInputFiles.push_back(static_cast(File)); + InputFiles = NewInputFiles; + + return Error::success(); } /// Runs the appropriate linking action on all the device files specified in \p @@ -784,7 +824,7 @@ ArrayRef LinkerArgs, SmallVectorImpl &LinkedImages) { // Get the list of inputs for a specific device. - StringMap> LinkerInputMap; + StringMap> LinkerInputMap; for (auto &DeviceFile : DeviceFiles) LinkerInputMap[DeviceFile.str()].push_back(DeviceFile.Filename); @@ -794,13 +834,16 @@ Triple TheTriple(TargetFeatures.first); StringRef Arch(TargetFeatures.second); - // TODO: Run LTO or bitcode linking before the final link job. 
- auto ObjectOrErr = - linkBitcodeFiles(LinkerInput.getValue(), TheTriple, Arch); - if (!ObjectOrErr) - return ObjectOrErr.takeError(); - if ((*ObjectOrErr).hasValue()) - LinkerInput.getValue() = {**ObjectOrErr}; + // Run LTO on any bitcode files and replace the input with the result. + if (Error Err = linkBitcodeFiles(LinkerInput.getValue(), TheTriple, Arch)) + return std::move(Err); + + // If we are embedding bitcode for JIT, skip the final device linking. + if (EmbedBC) { + assert(!LinkerInput.getValue().empty() && "No bitcode image to embed"); + LinkedImages.push_back(LinkerInput.getValue().front()); + continue; + } auto ImageOrErr = linkDevice(LinkerInput.getValue(), LinkerArgs, TheTriple, Arch);