diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -285,6 +285,10 @@ if (DriverArgs.hasArg(options::OPT_nogpulib)) return; + // Link the bitcode library late if we're using device LTO. + if (getDriver().isUsingLTO(/* IsOffload */ true)) + return; + std::string BitcodeSuffix; if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime, options::OPT_fno_openmp_target_new_runtime, true)) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -8164,6 +8164,34 @@ "-target-feature=" + TC->getTripleString() + "=" + *(FeatureIt + 1))); } + // Pass in the bitcode library to be linked during LTO. + for (auto TI = OpenMPTCRange.first, TE = OpenMPTCRange.second; TI != TE; + ++TI) { + const ToolChain *TC = TI->second; + const Driver &D = TC->getDriver(); + const ArgList &TCArgs = C.getArgsForToolChain(TC, "", Action::OFK_OpenMP); + StringRef Arch = TCArgs.getLastArgValue(options::OPT_march_EQ); + + std::string BitcodeSuffix; + if (TCArgs.hasFlag(options::OPT_fopenmp_target_new_runtime, + options::OPT_fno_openmp_target_new_runtime, true)) + BitcodeSuffix += "new-"; + if (TC->getTriple().isNVPTX()) + BitcodeSuffix += "nvptx-"; + else if (TC->getTriple().isAMDGPU()) + BitcodeSuffix += "amdgpu-"; + BitcodeSuffix += Arch; + + ArgStringList BitcodeLibrary; + addOpenMPDeviceRTL(D, TCArgs, BitcodeLibrary, BitcodeSuffix, + TC->getTriple()); + + if (!BitcodeLibrary.empty()) + CmdArgs.push_back( + Args.MakeArgString("-target-library=" + TC->getTripleString() + + "-" + Arch + "=" + BitcodeLibrary.back())); + } + // Pass in the optimization level to use for LTO. if (const Arg *A = Args.getLastArg(options::OPT_O_Group)) { StringRef OOpt; diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -744,6 +744,10 @@ return; } + // Link the bitcode library late if we're using device LTO. + if (getDriver().isUsingLTO(/* IsOffload */ true)) + return; + std::string BitcodeSuffix; if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime, options::OPT_fno_openmp_target_new_runtime, true)) diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -68,9 +68,14 @@ static cl::opt OptLevel("opt-level", cl::desc("Optimization level for LTO"), - cl::init("O0"), + cl::init("O2"), cl::cat(ClangLinkerWrapperCategory)); +static cl::opt + BitcodeLibrary("target-library", + cl::desc("Path for the target bitcode library"), + cl::cat(ClangLinkerWrapperCategory)); + // Do not parse linker options. static cl::list HostLinkerArgs(cl::Sink, cl::desc("...")); @@ -197,7 +202,7 @@ std::unique_ptr Output = std::move(*OutputOrErr); std::copy(Contents->begin(), Contents->end(), Output->getBufferStart()); if (Error E = Output->commit()) - return E; + return std::move(E); DeviceFiles.emplace_back(DeviceTriple, Arch, TempFile); ToBeStripped.push_back(*Name); @@ -225,7 +230,7 @@ std::unique_ptr Output = std::move(*OutputOrErr); std::copy(Contents.begin(), Contents.end(), Output->getBufferStart()); if (Error E = Output->commit()) - return E; + return std::move(E); StripFile = TempFile; } @@ -307,7 +312,7 @@ std::unique_ptr Output = std::move(*OutputOrErr); std::copy(Contents.begin(), Contents.end(), Output->getBufferStart()); if (Error E = Output->commit()) - return E; + return std::move(E); DeviceFiles.emplace_back(DeviceTriple, Arch, TempFile); ToBeDeleted.push_back(&GV); @@ -318,7 +323,7 @@ // We need to materialize the lazy module before we make any changes. if (Error Err = M->materializeAll()) - return Err; + return std::move(Err); // Remove the global from the module and write it to a new file. for (GlobalVariable *GV : ToBeDeleted) { @@ -392,7 +397,7 @@ } if (Err) - return Err; + return std::move(Err); if (!NewMembers) return None; @@ -406,9 +411,9 @@ std::unique_ptr Buffer = MemoryBuffer::getMemBuffer(Library.getMemoryBufferRef(), false); - if (Error WriteErr = writeArchive(TempFile, Members, true, Library.kind(), + if (Error Err = writeArchive(TempFile, Members, true, Library.kind(), true, Library.isThin(), std::move(Buffer))) - return WriteErr; + return std::move(Err); return static_cast(TempFile); } @@ -726,7 +731,7 @@ // Add the bitcode file with its resolved symbols to the LTO job. if (Error Err = LTOBackend->add(std::move(BitcodeFile), Resolutions)) - return Err; + return std::move(Err); } // Run the LTO job to compile the bitcode. @@ -744,7 +749,7 @@ std::make_unique(FD, true)); }; if (Error Err = LTOBackend->run(AddStream)) - return Err; + return std::move(Err); for (auto &File : Files) { if (!TheTriple.isNVPTX()) @@ -957,6 +962,17 @@ } } + // Add the device bitcode library to the device files if it was passed in. + if (!BitcodeLibrary.empty()) { + // FIXME: Hacky workaround to avoid a backend crash at O0. + if (OptLevel[1] - '0' == 0) + OptLevel[1] = '1'; + auto DeviceAndPath = StringRef(BitcodeLibrary).split('='); + auto TripleAndArch = DeviceAndPath.first.rsplit('-'); + DeviceFiles.emplace_back(TripleAndArch.first, TripleAndArch.second, + DeviceAndPath.second); + } + // Link the device images extracted from the linker input. SmallVector LinkedImages; if (Error Err = linkDeviceFiles(DeviceFiles, LinkerArgs, LinkedImages))