diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -100,6 +100,9 @@
   Generic, // A processor model named 'generic' if the target backend defines a
            // public one.
   LAST,
+
+  CudaDefault = CudaArch::SM_35,
+  HIPDefault = CudaArch::GFX803,
 };
 
 static inline bool IsNVIDIAGpuArch(CudaArch A) {
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -63,6 +63,8 @@
   "cannot find libdevice for %0; provide path to different CUDA installation "
   "via '--cuda-path', or pass '-nocudalib' to build without linking with "
   "libdevice">;
+def warn_drv_no_rdc_new_driver : Warning<
+  "Using '-foffload-new-driver' overrides '-fno-gpu-rdc'">;
 
 def err_drv_no_rocm_device_lib : Error<
   "cannot find ROCm device library%select{| for %1|for ABI version %1}0; provide its path via "
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4139,6 +4139,101 @@
   Args.ClaimAllArgs(options::OPT_cuda_compile_host_device);
 }
 
+/// Returns the canonical name for the offloading architecture when using HIP or
+/// CUDA.
+static StringRef getCanonicalArchString(Compilation &C,
+                                        llvm::opt::DerivedArgList &Args,
+                                        StringRef ArchStr,
+                                        Action::OffloadKind Kind) {
+  if (Kind == Action::OFK_Cuda) {
+    CudaArch Arch = StringToCudaArch(ArchStr);
+    if (Arch == CudaArch::UNKNOWN || !IsNVIDIAGpuArch(Arch)) {
+      C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr;
+      return StringRef();
+    }
+    return Args.MakeArgStringRef(CudaArchToString(Arch));
+  } else if (Kind == Action::OFK_HIP) {
+    llvm::StringMap<bool> Features;
+    // getHIPOffloadTargetTriple() is known to return valid value as it has
+    // been called successfully in the CreateOffloadingDeviceToolChains().
+    auto Arch = parseTargetID(
+        *getHIPOffloadTargetTriple(C.getDriver(), C.getInputArgs()), ArchStr,
+        &Features);
+    if (!Arch) {
+      C.getDriver().Diag(clang::diag::err_drv_bad_target_id) << ArchStr;
+      C.setContainsError();
+      return StringRef();
+    }
+    return Args.MakeArgStringRef(
+        getCanonicalTargetID(Arch.getValue(), Features));
+  }
+  return StringRef();
+}
+
+/// Checks if the set offloading architectures does not conflict. Returns the
+/// incompatible pair if a conflict occurs.
+static llvm::Optional<std::pair<llvm::StringRef, llvm::StringRef>>
+getConflictOffloadArchCombination(const llvm::DenseSet<StringRef> &Archs,
+                                  Action::OffloadKind Kind) {
+  if (Kind != Action::OFK_HIP)
+    return None;
+
+  std::set<StringRef> ArchSet;
+  llvm::copy(Archs, std::inserter(ArchSet, ArchSet.begin()));
+  return getConflictTargetIDCombination(ArchSet);
+}
+
+/// Returns the set of bound architectures active for this offloading kind.
+/// OpenMP has no bound architecture, so it yields only the empty string; CUDA
+/// and HIP fall back to their default architecture when none was requested.
+static llvm::DenseSet<StringRef>
+getOffloadArchs(Compilation &C, llvm::opt::DerivedArgList &Args,
+                Action::OffloadKind Kind) {
+
+  // If this is OpenMP offloading we don't use a bound architecture.
+  if (Kind == Action::OFK_OpenMP)
+    return llvm::DenseSet<StringRef>{StringRef()};
+
+  // --offload and --offload-arch options are mutually exclusive.
+  if (Args.hasArgNoClaim(options::OPT_offload_EQ) &&
+      Args.hasArgNoClaim(options::OPT_offload_arch_EQ,
+                         options::OPT_no_offload_arch_EQ)) {
+    C.getDriver().Diag(diag::err_opt_not_valid_with_opt)
+        << "--offload"
+        << (Args.hasArgNoClaim(options::OPT_offload_arch_EQ)
+                ? "--offload-arch"
+                : "--no-offload-arch");
+  }
+
+  llvm::DenseSet<StringRef> Archs;
+  for (auto &Arg : Args) {
+    if (Arg->getOption().matches(options::OPT_offload_arch_EQ)) {
+      Archs.insert(getCanonicalArchString(C, Args, Arg->getValue(), Kind));
+    } else if (Arg->getOption().matches(options::OPT_no_offload_arch_EQ)) {
+      if (Arg->getValue() == StringRef("all"))
+        Archs.clear();
+      else
+        Archs.erase(getCanonicalArchString(C, Args, Arg->getValue(), Kind));
+    }
+  }
+
+  if (auto ConflictingArchs = getConflictOffloadArchCombination(Archs, Kind)) {
+    C.getDriver().Diag(clang::diag::err_drv_bad_offload_arch_combo)
+        << ConflictingArchs.getValue().first
+        << ConflictingArchs.getValue().second;
+    C.setContainsError();
+  }
+
+  if (Archs.empty()) {
+    if (Kind == Action::OFK_Cuda)
+      Archs.insert(CudaArchToString(CudaArch::CudaDefault));
+    else if (Kind == Action::OFK_HIP)
+      Archs.insert(CudaArchToString(CudaArch::HIPDefault));
+  }
+
+  return Archs;
+}
+
 Action *Driver::BuildOffloadingActions(Compilation &C,
                                        llvm::opt::DerivedArgList &Args,
                                        const InputTy &Input,
@@ -4151,7 +4246,8 @@
   types::ID InputType = Input.first;
   const Arg *InputArg = Input.second;
 
-  const Action::OffloadKind OffloadKinds[] = {Action::OFK_OpenMP};
+  const Action::OffloadKind OffloadKinds[] = {
+      Action::OFK_OpenMP, Action::OFK_Cuda, Action::OFK_HIP};
 
   for (Action::OffloadKind Kind : OffloadKinds) {
     SmallVector<const ToolChain *, 2> ToolChains;
@@ -4164,7 +4260,13 @@
     if (ToolChains.empty())
       continue;
 
-    for (unsigned I = 0; I < ToolChains.size(); ++I)
+    // Get the product of all bound architectures and toolchains.
+    SmallVector<std::pair<const ToolChain *, StringRef>> TCAndArchs;
+    for (const ToolChain *TC : ToolChains)
+      for (StringRef Arch : getOffloadArchs(C, Args, Kind))
+        TCAndArchs.push_back(std::make_pair(TC, Arch));
+
+    for (unsigned I = 0, E = TCAndArchs.size(); I != E; ++I)
       DeviceActions.push_back(C.MakeAction<InputAction>(*InputArg, InputType));
 
     if (DeviceActions.empty())
@@ -4178,27 +4280,41 @@
         break;
       }
 
-      auto TC = ToolChains.begin();
+      auto TCAndArch = TCAndArchs.begin();
       for (Action *&A : DeviceActions) {
         A = ConstructPhaseAction(C, Args, Phase, A, Kind);
 
         if (isa<CompileJobAction>(A) && Kind == Action::OFK_OpenMP) {
+          // OpenMP offloading has a dependency on the host compile action to
+          // identify which declarations need to be emitted. This shouldn't be
+          // collapsed with any other actions so we can use it in the device.
           HostAction->setCannotBeCollapsedWithNextDependentAction();
           OffloadAction::HostDependence HDep(
               *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
-              /*BourdArch=*/nullptr, Action::OFK_OpenMP);
+              /*BoundArch=*/nullptr, Kind);
           OffloadAction::DeviceDependences DDep;
-          DDep.add(*A, **TC, /*BoundArch=*/nullptr, Kind);
+          DDep.add(*A, *TCAndArch->first, /*BoundArch=*/nullptr, Kind);
           A = C.MakeAction<OffloadAction>(HDep, DDep);
+        } else if (isa<AssembleJobAction>(A) && Kind == Action::OFK_Cuda) {
+          // The Cuda toolchain uses fatbinary as the linker phase to bundle the
+          // PTX and Cubin output.
+          ActionList FatbinActions;
+          for (Action *A : {A, A->getInputs()[0]}) {
+            OffloadAction::DeviceDependences DDep;
+            DDep.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind);
+            FatbinActions.emplace_back(
+                C.MakeAction<OffloadAction>(DDep, A->getType()));
+          }
+          A = C.MakeAction<LinkJobAction>(FatbinActions, types::TY_CUDA_FATBIN);
         }
-        ++TC;
+        ++TCAndArch;
       }
     }
 
-    auto TC = ToolChains.begin();
+    auto TCAndArch = TCAndArchs.begin();
     for (Action *A : DeviceActions) {
-      DDeps.add(*A, **TC, /*BoundArch=*/nullptr, Kind);
-      TC++;
+      DDeps.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind);
+      ++TCAndArch;
     }
   }
 
@@ -4303,7 +4419,7 @@
       return C.MakeAction<BackendJobAction>(Input, Output);
     }
     if (isUsingLTO(/* IsOffload */ true) &&
-        TargetDeviceOffloadKind == Action::OFK_OpenMP) {
+        TargetDeviceOffloadKind != Action::OFK_None) {
       types::ID Output =
           Args.hasArg(options::OPT_S) ? types::TY_LTO_IR : types::TY_LTO_BC;
       return C.MakeAction<BackendJobAction>(Input, Output);
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6222,7 +6222,11 @@
   }
 
   if (IsCuda || IsHIP) {
-    if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+    if (Args.hasArg(options::OPT_fno_gpu_rdc) && IsCudaDevice &&
+        Args.hasArg(options::OPT_foffload_new_driver))
+      D.Diag(diag::warn_drv_no_rdc_new_driver);
+    if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) ||
+        Args.hasArg(options::OPT_foffload_new_driver))
       CmdArgs.push_back("-fgpu-rdc");
     if (Args.hasFlag(options::OPT_fgpu_defer_diag,
                      options::OPT_fno_gpu_defer_diag, false))
@@ -6887,7 +6891,8 @@
   if ((IsCuda || IsHIP) && CudaDeviceInput) {
     CmdArgs.push_back("-fcuda-include-gpubinary");
     CmdArgs.push_back(CudaDeviceInput->getFilename());
-    if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+    if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+                     false))
       CmdArgs.push_back("-fgpu-rdc");
   }
 
@@ -8203,14 +8208,17 @@
   ArgStringList CmdArgs;
 
   // Pass the CUDA path to the linker wrapper tool.
-  for (auto &I : llvm::make_range(OpenMPTCRange.first, OpenMPTCRange.second)) {
-    const ToolChain *TC = I.second;
-    if (TC->getTriple().isNVPTX()) {
-      CudaInstallationDetector CudaInstallation(D, TheTriple, Args);
-      if (CudaInstallation.isValid())
-        CmdArgs.push_back(Args.MakeArgString(
-            "--cuda-path=" + CudaInstallation.getInstallPath()));
-      break;
+  for (Action::OffloadKind Kind : {Action::OFK_Cuda, Action::OFK_OpenMP}) {
+    auto TCRange = C.getOffloadToolChains(Kind);
+    for (auto &I : llvm::make_range(TCRange.first, TCRange.second)) {
+      const ToolChain *TC = I.second;
+      if (TC->getTriple().isNVPTX()) {
+        CudaInstallationDetector CudaInstallation(D, TheTriple, Args);
+        if (CudaInstallation.isValid())
+          CmdArgs.push_back(Args.MakeArgString(
+              "--cuda-path=" + CudaInstallation.getInstallPath()));
+        break;
+      }
     }
   }
 
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -461,8 +461,9 @@
                                options::OPT_fnoopenmp_relocatable_target,
                                /*Default=*/true);
   else if (JA.isOffloading(Action::OFK_Cuda))
-    Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
-                               options::OPT_fno_gpu_rdc, /*Default=*/false);
+    Relocatable = Args.hasArg(options::OPT_foffload_new_driver) ||
+                  Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+                               /*Default=*/false);
 
   if (Relocatable)
     CmdArgs.push_back("-c");
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -222,7 +222,8 @@
     CC1Args.push_back("-fcuda-approx-transcendentals");
 
   if (!DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
-                          false))
+                          false) ||
+      DriverArgs.hasArg(options::OPT_foffload_new_driver))
     CC1Args.append({"-mllvm", "-amdgpu-internalize-symbols"});
 
   StringRef MaxThreadsPerBlock =
diff --git a/clang/test/Driver/cuda-openmp-driver.cu b/clang/test/Driver/cuda-openmp-driver.cu
new file mode 100644
--- /dev/null
+++ b/clang/test/Driver/cuda-openmp-driver.cu
@@ -0,0 +1,19 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang -### -target x86_64-linux-gnu -nocudalib -ccc-print-bindings -fgpu-rdc \
+// RUN:        -foffload-new-driver --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \
+// RUN: | FileCheck -check-prefix BINDINGS %s
+
+// BINDINGS: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[PTX_SM_35:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_35]]"], output: "[[CUBIN_SM_35:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Linker", inputs: ["[[CUBIN_SM_35]]", "[[PTX_SM_35]]"], output: "[[FATBIN_SM_35:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]"], output: "[[PTX_SM_70:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_70]]"], output: "[[CUBIN_SM_70:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Linker", inputs: ["[[CUBIN_SM_70]]", "[[PTX_SM_70]]"], output: "[[FATBIN_SM_70:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[FATBIN_SM_35]]", "[[FATBIN_SM_70]]"], output: "[[HOST_OBJ:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
+
+// RUN: %clang -### -nocudalib -foffload-new-driver %s 2>&1 | FileCheck -check-prefix RDC %s
+// RDC: clang{{.*}}"-cc1" "-triple" "nvptx64-nvidia-cuda"{{.*}}"-fgpu-rdc"
+// RDC: ptxas{{.*}}-c
diff --git a/clang/test/Driver/cuda-phases.cu b/clang/test/Driver/cuda-phases.cu
--- a/clang/test/Driver/cuda-phases.cu
+++ b/clang/test/Driver/cuda-phases.cu
@@ -217,3 +217,32 @@
 // DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]])
 // DASM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler
 // DASM2-NOT: host
+
+//
+// Test the phases generated when using the new offloading driver.
+//
+// RUN: %clang -### -target powerpc64le-ibm-linux-gnu -ccc-print-phases -foffload-new-driver \
+// RUN: --offload-arch=sm_52 --offload-arch=sm_70 %s 2>&1 | FileCheck --check-prefix=NEW_DRIVER %s
+// NEW_DRIVER: 0: input, "[[INPUT:.*]]", cuda, (host-cuda)
+// NEW_DRIVER: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// NEW_DRIVER: 2: compiler, {1}, ir, (host-cuda)
+// NEW_DRIVER: 3: input, "[[INPUT]]", cuda, (device-cuda, sm_52)
+// NEW_DRIVER: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_52)
+// NEW_DRIVER: 5: compiler, {4}, ir, (device-cuda, sm_52)
+// NEW_DRIVER: 6: backend, {5}, assembler, (device-cuda, sm_52)
+// NEW_DRIVER: 7: assembler, {6}, object, (device-cuda, sm_52)
+// NEW_DRIVER: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, object
+// NEW_DRIVER: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {6}, assembler
+// NEW_DRIVER: 10: linker, {8, 9}, cuda-fatbin, (device-cuda, sm_52)
+// NEW_DRIVER: 11: input, "[[INPUT]]", cuda, (device-cuda, sm_70)
+// NEW_DRIVER: 12: preprocessor, {11}, cuda-cpp-output, (device-cuda, sm_70)
+// NEW_DRIVER: 13: compiler, {12}, ir, (device-cuda, sm_70)
+// NEW_DRIVER: 14: backend, {13}, assembler, (device-cuda, sm_70)
+// NEW_DRIVER: 15: assembler, {14}, object, (device-cuda, sm_70)
+// NEW_DRIVER: 16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {15}, object
+// NEW_DRIVER: 17: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {14}, assembler
+// NEW_DRIVER: 18: linker, {16, 17}, cuda-fatbin, (device-cuda, sm_70)
+// NEW_DRIVER: 19: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {10}, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {18}, ir
+// NEW_DRIVER: 20: backend, {19}, assembler, (host-cuda)
+// NEW_DRIVER: 21: assembler, {20}, object, (host-cuda)
+// NEW_DRIVER: 22: clang-linker-wrapper, {21}, image, (host-cuda)