diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -38,5 +38,4 @@ // RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \ // RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=LTO -// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 -c {{.*}}.s -// LTO: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.cubin +// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 {{.*}}.s diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -595,7 +595,7 @@ // TODO: Move these to a separate file. namespace nvptx { Expected assemble(StringRef InputFile, Triple TheTriple, - StringRef Arch) { + StringRef Arch, bool RDC = true) { // NVPTX uses the ptxas binary to create device object files. Expected PtxasPath = findProgram("ptxas", {CudaBinaryPath}); if (!PtxasPath) @@ -626,7 +626,8 @@ CmdArgs.push_back(Opt); CmdArgs.push_back("--gpu-name"); CmdArgs.push_back(Arch); - CmdArgs.push_back("-c"); + if (RDC) + CmdArgs.push_back("-c"); CmdArgs.push_back(InputFile); @@ -933,7 +934,8 @@ } Error linkBitcodeFiles(SmallVectorImpl &InputFiles, - const Triple &TheTriple, StringRef Arch) { + const Triple &TheTriple, StringRef Arch, + bool &WholeProgram) { SmallVector, 4> SavedBuffers; SmallVector, 4> BitcodeFiles; SmallVector NewInputFiles; @@ -1009,7 +1011,7 @@ }; // We assume visibility of the whole program if every input file was bitcode. - bool WholeProgram = BitcodeFiles.size() == InputFiles.size(); + WholeProgram = BitcodeFiles.size() == InputFiles.size(); auto LTOBackend = (EmbedBitcode) ? createLTO(TheTriple, Arch, WholeProgram, OutputBitcode) : createLTO(TheTriple, Arch, WholeProgram); @@ -1089,7 +1091,7 @@ // Is we are compiling for NVPTX we need to run the assembler first. if (TheTriple.isNVPTX() && !EmbedBitcode) { for (auto &File : Files) { - auto FileOrErr = nvptx::assemble(File, TheTriple, Arch); + auto FileOrErr = nvptx::assemble(File, TheTriple, Arch, !WholeProgram); if (!FileOrErr) return FileOrErr.takeError(); File = *FileOrErr; @@ -1117,10 +1119,11 @@ for (auto &LinkerInput : LinkerInputMap) { DeviceFile &File = LinkerInput.getFirst(); Triple TheTriple = Triple(File.TheTriple); + bool WholeProgram = false; // Run LTO on any bitcode files and replace the input with the result. - if (Error Err = - linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, File.Arch)) + if (Error Err = linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, + File.Arch, WholeProgram)) return Err; // If we are embedding bitcode for JIT, skip the final device linking. @@ -1130,6 +1133,14 @@ continue; } + // If we performed LTO on NVPTX and had whole program visibility, we can use + // CUDA in non-RDC mode. + if (WholeProgram && TheTriple.isNVPTX()) { + assert(!LinkerInput.getSecond().empty() && "No non-RDC image to embed"); + LinkedImages.push_back(LinkerInput.getSecond().front()); + continue; + } + auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch); if (!ImageOrErr) return ImageOrErr.takeError();