diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -60,8 +60,6 @@ "cannot find libdevice for %0; provide path to different CUDA installation " "via '--cuda-path', or pass '-nocudalib' to build without linking with " "libdevice">; -def err_drv_no_rdc_new_driver : Error< - "Using '--offload-new-driver' requires '-fgpu-rdc'">; def err_drv_no_rocm_device_lib : Error< "cannot find ROCm device library%select{| for %1|for ABI version %1}0; provide its path via " diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -212,8 +212,7 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), TheModule(CGM.getModule()), - RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode || - CGM.getLangOpts().OffloadingNewDriver), + RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), DeviceMC(InitDeviceMC(CGM)) { CodeGen::CodeGenTypes &Types = CGM.getTypes(); ASTContext &Ctx = CGM.getContext(); @@ -1172,10 +1171,11 @@ } return nullptr; } - if (!(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)) + if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode) + createOffloadingEntries(); + else return makeModuleCtorFunction(); - createOffloadingEntries(); return nullptr; } diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -2930,7 +2930,7 @@ return false; Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, - options::OPT_fno_gpu_rdc, /*Default=*/false); + options::OPT_fno_gpu_rdc, /*Default=*/false); const ToolChain *HostTC = C.getSingleOffloadToolChain(); assert(HostTC && "No toolchain for host compilation."); @@ -4486,11 +4486,23 @@ if (offloadDeviceOnly()) return C.MakeAction(DDeps, types::TY_Nothing); - Action *OffloadPackager = - C.MakeAction(OffloadActions, types::TY_Image); OffloadAction::DeviceDependences DDep; - DDep.add(*OffloadPackager, *C.getSingleOffloadToolChain(), - nullptr, Action::OFK_None); + if (C.isOffloadingHostKind(Action::OFK_Cuda) && + !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)) { + // If we are not in RDC-mode we just emit the final CUDA fatbinary for each + // translation unit without requiring any linking. + Action *FatbinAction = + C.MakeAction(OffloadActions, types::TY_CUDA_FATBIN); + DDep.add(*FatbinAction, *C.getSingleOffloadToolChain(), + nullptr, Action::OFK_Cuda); + } else { + // Package all the offloading actions into a single output that can be + // embedded in the host and linked. + Action *PackagerAction = + C.MakeAction(OffloadActions, types::TY_Image); + DDep.add(*PackagerAction, *C.getSingleOffloadToolChain(), + nullptr, Action::OFK_None); + } OffloadAction::HostDependence HDep( *HostAction, *C.getSingleOffloadToolChain(), /*BoundArch=*/nullptr, isa(HostAction) ? DDep : DDeps); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4441,12 +4441,14 @@ Args.hasFlag(options::OPT_offload_new_driver, options::OPT_no_offload_new_driver, false)); + bool IsRDCMode = + Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false); bool IsUsingLTO = D.isUsingLTO(IsDeviceOffloadAction); auto LTOMode = D.getLTOMode(IsDeviceOffloadAction); // A header module compilation doesn't have a main input file, so invent a // fake one as a placeholder. - const char *ModuleName = [&]{ + const char *ModuleName = [&] { auto *ModuleNameArg = Args.getLastArg(options::OPT_fmodule_name_EQ); return ModuleNameArg ? ModuleNameArg->getValue() : ""; }(); @@ -6304,10 +6306,7 @@ } if (IsCuda || IsHIP) { - if (!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) && - Args.hasArg(options::OPT_offload_new_driver)) - D.Diag(diag::err_drv_no_rdc_new_driver); - if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)) + if (IsRDCMode) CmdArgs.push_back("-fgpu-rdc"); if (Args.hasFlag(options::OPT_fgpu_defer_diag, options::OPT_fno_gpu_defer_diag, false)) @@ -6979,11 +6978,22 @@ } } - // Host-side cuda compilation receives all device-side outputs in a single - // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary. + // Host-side offloading compilation receives all device-side outputs. Include + // them in the host compilation depending on the target. If the host inputs + // are not empty we use the new-driver scheme, otherwise use the old scheme. if ((IsCuda || IsHIP) && CudaDeviceInput) { + CmdArgs.push_back("-fcuda-include-gpubinary"); + CmdArgs.push_back(CudaDeviceInput->getFilename()); + } else if (!HostOffloadingInputs.empty()) { + if (IsCuda && !IsRDCMode) { + assert(HostOffloadingInputs.size() == 1 && "Only one input expected"); CmdArgs.push_back("-fcuda-include-gpubinary"); - CmdArgs.push_back(CudaDeviceInput->getFilename()); + CmdArgs.push_back(HostOffloadingInputs.front().getFilename()); + } else { + for (const InputInfo Input : HostOffloadingInputs) + CmdArgs.push_back(Args.MakeArgString("-fembed-offload-object=" + + TC.getInputFilename(Input))); + } } if (IsCuda) { @@ -7032,12 +7042,6 @@ } } - // Host-side offloading recieves the device object files and embeds it in a - // named section including the associated target triple and architecture. - for (const InputInfo Input : HostOffloadingInputs) - CmdArgs.push_back(Args.MakeArgString("-fembed-offload-object=" + - TC.getInputFilename(Input))); - if (Triple.isAMDGPU()) { handleAMDGPUCodeObjectVersionOptions(D, Args, CmdArgs); diff --git a/clang/test/CodeGenCUDA/offloading-entries.cu b/clang/test/CodeGenCUDA/offloading-entries.cu --- a/clang/test/CodeGenCUDA/offloading-entries.cu +++ b/clang/test/CodeGenCUDA/offloading-entries.cu @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --global-value-regex ".omp_offloading.entry.*" -// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu \ +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -fgpu-rdc \ // RUN: --offload-new-driver -emit-llvm -o - -x cuda %s | FileCheck \ // RUN: --check-prefix=CUDA %s -// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu \ +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -fgpu-rdc \ // RUN: --offload-new-driver -emit-llvm -o - -x hip %s | FileCheck \ // RUN: --check-prefix=HIP %s diff --git a/clang/test/Driver/cuda-omp-unsupported-debug-options.cu b/clang/test/Driver/cuda-omp-unsupported-debug-options.cu --- a/clang/test/Driver/cuda-omp-unsupported-debug-options.cu +++ b/clang/test/Driver/cuda-omp-unsupported-debug-options.cu @@ -25,34 +25,34 @@ // Same tests for OpenMP // RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \ -// RUN: -g -gz 2>&1 | FileCheck %s --check-prefixes WARN,COMMON +// RUN: -fgpu-rdc -g -gz 2>&1 | FileCheck %s --check-prefixes WARN,COMMON // RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \ -// RUN: -gdwarf -fdebug-info-for-profiling 2>&1 | FileCheck %s --check-prefixes WARN,COMMON +// RUN: -fgpu-rdc -gdwarf -fdebug-info-for-profiling 2>&1 | FileCheck %s --check-prefixes WARN,COMMON // RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \ -// RUN: -gdwarf-2 -gsplit-dwarf 2>&1 | FileCheck %s --check-prefixes WARN,COMMON +// RUN: -fgpu-rdc -gdwarf-2 -gsplit-dwarf 2>&1 | FileCheck %s --check-prefixes WARN,COMMON // RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \ -// RUN: -gdwarf-3 -glldb 2>&1 | FileCheck %s --check-prefixes WARN,COMMON +// RUN: -fgpu-rdc -gdwarf-3 -glldb 2>&1 | FileCheck %s --check-prefixes WARN,COMMON // RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \ -// RUN: -gdwarf-4 -gcodeview 2>&1 | FileCheck %s --check-prefixes WARN,COMMON +// RUN: -fgpu-rdc -gdwarf-4 -gcodeview 2>&1 | FileCheck %s --check-prefixes WARN,COMMON // RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \ -// RUN: -gdwarf-5 -gmodules 2>&1 | FileCheck %s --check-prefixes WARN,COMMON +// RUN: -fgpu-rdc -gdwarf-5 -gmodules 2>&1 | FileCheck %s --check-prefixes WARN,COMMON // RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \ -// RUN: -ggdb1 -fdebug-macro 2>&1 | FileCheck %s --check-prefixes WARN,COMMON +// RUN: -fgpu-rdc -ggdb1 -fdebug-macro 2>&1 | FileCheck %s --check-prefixes WARN,COMMON // RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \ -// RUN: -ggdb2 -ggnu-pubnames 2>&1 | FileCheck %s --check-prefixes WARN,COMMON +// RUN: -fgpu-rdc -ggdb2 -ggnu-pubnames 2>&1 | FileCheck %s --check-prefixes WARN,COMMON // RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \ -// RUN: -ggdb3 -gdwarf-aranges 2>&1 | FileCheck %s --check-prefixes WARN,COMMON +// RUN: -fgpu-rdc -ggdb3 -gdwarf-aranges 2>&1 | FileCheck %s --check-prefixes WARN,COMMON // RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \ -// RUN: -g -gcolumn-info -fdebug-types-section 2>&1 | FileCheck %s --check-prefixes WARN,COMMON +// RUN: -fgpu-rdc -g -gcolumn-info -fdebug-types-section 2>&1 | FileCheck %s --check-prefixes WARN,COMMON // RUN: %clang -### -target x86_64-linux-gnu -c %s -gdwarf-5 -gembed-source 2>&1 \ // RUN: | FileCheck %s --check-prefixes WARN-GES,COMMON // RUN: %clang -### -target x86_64-linux-gnu -c %s -ggdb -gembed-source -gdwarf-5 2>&1 \ // RUN: | FileCheck %s --check-prefixes WARN-GES,COMMON // RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \ -// RUN: -gdwarf-5 -gembed-source 2>&1 | FileCheck %s --check-prefixes WARN-GES,COMMON +// RUN: -fgpu-rdc -gdwarf-5 -gembed-source 2>&1 | FileCheck %s --check-prefixes WARN-GES,COMMON // RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \ -// RUN: -ggdb -gembed-source -gdwarf-5 2>&1 | FileCheck %s --check-prefixes WARN-GES,COMMON +// RUN: -fgpu-rdc -ggdb -gembed-source -gdwarf-5 2>&1 | FileCheck %s --check-prefixes WARN-GES,COMMON // COMMON: warning: debug information option '{{-gz|-fdebug-info-for-profiling|-gsplit-dwarf|-glldb|-gcodeview|-gmodules|-gembed-source|-fdebug-macro|-ggnu-pubnames|-gdwarf-aranges|-fdebug-types-section}}' is not supported // WARN-SAME: for target 'nvptx64-nvidia-cuda' [-Wunsupported-target-opt] diff --git a/clang/test/Driver/cuda-openmp-driver.cu b/clang/test/Driver/cuda-openmp-driver.cu --- a/clang/test/Driver/cuda-openmp-driver.cu +++ b/clang/test/Driver/cuda-openmp-driver.cu @@ -13,9 +13,6 @@ // BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[BINARY]]"], output: "[[HOST_OBJ:.+]]" // BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out" -// RUN: %clang -### -nocudalib --offload-new-driver %s 2>&1 | FileCheck -check-prefix RDC %s -// RDC: error: Using '--offload-new-driver' requires '-fgpu-rdc' - // RUN: %clang -### -target x86_64-linux-gnu -nocudalib -ccc-print-bindings -fgpu-rdc \ // RUN: --offload-new-driver --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \ // RUN: | FileCheck -check-prefix BINDINGS-HOST %s @@ -37,3 +34,10 @@ // RUN: | FileCheck -check-prefix DEVICE-LINK %s // DEVICE-LINK: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[INPUT:.+]]"], output: "a.out" + +// RUN: %clang -### -target x86_64-linux-gnu -nocudalib --offload-new-driver \ +// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \ +// RUN: | FileCheck -check-prefix GPU-BINARY %s + +// GPU-BINARY: fatbinary{{.*}}"--create" "{{.*}}.fatbin" +// GPU-BINARY: -cc1{{.*}}-fcuda-include-gpubinary" "{{.*}}.fatbin" diff --git a/clang/test/Driver/cuda-phases.cu b/clang/test/Driver/cuda-phases.cu --- a/clang/test/Driver/cuda-phases.cu +++ b/clang/test/Driver/cuda-phases.cu @@ -221,25 +221,48 @@ // // Test the phases generated when using the new offloading driver. // -// RUN: %clang -### -target powerpc64le-ibm-linux-gnu -ccc-print-phases --offload-new-driver \ -// RUN: --offload-arch=sm_52 --offload-arch=sm_70 %s 2>&1 | FileCheck --check-prefix=NEW_DRIVER %s -// NEW_DRIVER: 0: input, "[[INPUT:.+]]", cuda -// NEW_DRIVER: 1: preprocessor, {0}, cuda-cpp-output -// NEW_DRIVER: 2: compiler, {1}, ir -// NEW_DRIVER: 3: input, "[[INPUT]]", cuda, (device-cuda, sm_52) -// NEW_DRIVER: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_52) -// NEW_DRIVER: 5: compiler, {4}, ir, (device-cuda, sm_52) -// NEW_DRIVER: 6: backend, {5}, assembler, (device-cuda, sm_52) -// NEW_DRIVER: 7: assembler, {6}, object, (device-cuda, sm_52) -// NEW_DRIVER: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, object -// NEW_DRIVER: 9: input, "[[INPUT]]", cuda, (device-cuda, sm_70) -// NEW_DRIVER: 10: preprocessor, {9}, cuda-cpp-output, (device-cuda, sm_70) -// NEW_DRIVER: 11: compiler, {10}, ir, (device-cuda, sm_70) -// NEW_DRIVER: 12: backend, {11}, assembler, (device-cuda, sm_70) -// NEW_DRIVER: 13: assembler, {12}, object, (device-cuda, sm_70) -// NEW_DRIVER: 14: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {13}, object -// NEW_DRIVER: 15: clang-offload-packager, {8, 14}, image -// NEW_DRIVER: 16: offload, " (powerpc64le-ibm-linux-gnu)" {2}, " (powerpc64le-ibm-linux-gnu)" {15}, ir -// NEW_DRIVER: 17: backend, {16}, assembler, (host-cuda) -// NEW_DRIVER: 18: assembler, {17}, object, (host-cuda) -// NEW_DRIVER: 19: clang-linker-wrapper, {18}, image, (host-cuda) +// RUN: %clang -### -target powerpc64le-ibm-linux-gnu -ccc-print-phases --offload-new-driver -fgpu-rdc \ +// RUN: --offload-arch=sm_52 --offload-arch=sm_70 %s 2>&1 | FileCheck --check-prefix=NEW-DRIVER-RDC %s +// NEW-DRIVER-RDC: 0: input, "[[INPUT:.+]]", cuda +// NEW-DRIVER-RDC: 1: preprocessor, {0}, cuda-cpp-output +// NEW-DRIVER-RDC: 2: compiler, {1}, ir +// NEW-DRIVER-RDC: 3: input, "[[INPUT]]", cuda, (device-cuda, sm_52) +// NEW-DRIVER-RDC: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_52) +// NEW-DRIVER-RDC: 5: compiler, {4}, ir, (device-cuda, sm_52) +// NEW-DRIVER-RDC: 6: backend, {5}, assembler, (device-cuda, sm_52) +// NEW-DRIVER-RDC: 7: assembler, {6}, object, (device-cuda, sm_52) +// NEW-DRIVER-RDC: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, object +// NEW-DRIVER-RDC: 9: input, "[[INPUT]]", cuda, (device-cuda, sm_70) +// NEW-DRIVER-RDC: 10: preprocessor, {9}, cuda-cpp-output, (device-cuda, sm_70) +// NEW-DRIVER-RDC: 11: compiler, {10}, ir, (device-cuda, sm_70) +// NEW-DRIVER-RDC: 12: backend, {11}, assembler, (device-cuda, sm_70) +// NEW-DRIVER-RDC: 13: assembler, {12}, object, (device-cuda, sm_70) +// NEW-DRIVER-RDC: 14: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {13}, object +// NEW-DRIVER-RDC: 15: clang-offload-packager, {8, 14}, image +// NEW-DRIVER-RDC: 16: offload, " (powerpc64le-ibm-linux-gnu)" {2}, " (powerpc64le-ibm-linux-gnu)" {15}, ir +// NEW-DRIVER-RDC: 17: backend, {16}, assembler, (host-cuda) +// NEW-DRIVER-RDC: 18: assembler, {17}, object, (host-cuda) +// NEW-DRIVER-RDC: 19: clang-linker-wrapper, {18}, image, (host-cuda) + +// RUN: %clang -### -target powerpc64le-ibm-linux-gnu -ccc-print-phases --offload-new-driver -fgpu-rdc \ +// RUN: --offload-arch=sm_52 --offload-arch=sm_70 %s 2>&1 | FileCheck --check-prefix=NEW-DRIVER %s +// NEW-DRIVER: 0: input, "[[INPUT:.+]]", cuda +// NEW-DRIVER: 1: preprocessor, {0}, cuda-cpp-output +// NEW-DRIVER: 2: compiler, {1}, ir +// NEW-DRIVER: 3: input, "[[INPUT]]", cuda, (device-cuda, sm_52) +// NEW-DRIVER: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_52) +// NEW-DRIVER: 5: compiler, {4}, ir, (device-cuda, sm_52) +// NEW-DRIVER: 6: backend, {5}, assembler, (device-cuda, sm_52) +// NEW-DRIVER: 7: assembler, {6}, object, (device-cuda, sm_52) +// NEW-DRIVER: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, object +// NEW-DRIVER: 9: input, "[[INPUT]]", cuda, (device-cuda, sm_70) +// NEW-DRIVER: 10: preprocessor, {9}, cuda-cpp-output, (device-cuda, sm_70) +// NEW-DRIVER: 11: compiler, {10}, ir, (device-cuda, sm_70) +// NEW-DRIVER: 12: backend, {11}, assembler, (device-cuda, sm_70) +// NEW-DRIVER: 13: assembler, {12}, object, (device-cuda, sm_70) +// NEW-DRIVER: 14: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {13}, object +// NEW-DRIVER: 15: clang-offload-packager, {8, 14}, image +// NEW-DRIVER: 16: offload, " (powerpc64le-ibm-linux-gnu)" {2}, " (powerpc64le-ibm-linux-gnu)" {15}, ir +// NEW-DRIVER: 17: backend, {16}, assembler, (host-cuda) +// NEW-DRIVER: 18: assembler, {17}, object, (host-cuda) +// NEW-DRIVER: 19: clang-linker-wrapper, {18}, image, (host-cuda)