diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1124,6 +1124,10 @@ Group, HelpText<"Bundle output files of HIP device compilation">; def no_gpu_bundle_output : Flag<["--"], "no-gpu-bundle-output">, Group, HelpText<"Do not bundle output files of HIP device compilation">; +def fhip_emit_relocatable : Flag<["-"], "fhip-emit-relocatable">, Group, + HelpText<"Compile HIP source to relocatable">; +def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">, Group, + HelpText<"Do not override toolchain to compile HIP source to relocatable">; def cuid_EQ : Joined<["-"], "cuid=">, Flags<[CC1Option]>, HelpText<"An ID for compilation unit, which should be the same for the same " "compilation unit but different for different compilation units. " diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -2946,7 +2946,12 @@ CudaActionBuilderBase(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs, Action::OffloadKind OFKind) - : DeviceActionBuilder(C, Args, Inputs, OFKind) {} + : DeviceActionBuilder(C, Args, Inputs, OFKind) { + + CompileDeviceOnly = C.getDriver().offloadDeviceOnly(); + Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, + options::OPT_fno_gpu_rdc, /*Default=*/false); + } ActionBuilderReturnCode addDeviceDependences(Action *HostAction) override { // While generating code for CUDA, we only depend on the host input action @@ -3099,9 +3104,6 @@ !C.hasOffloadToolChain()) return false; - Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, - options::OPT_fno_gpu_rdc, /*Default=*/false); - const ToolChain *HostTC = C.getSingleOffloadToolChain(); assert(HostTC && "No toolchain for host compilation."); if (HostTC->getTriple().isNVPTX() || @@ -3120,7 +3122,6 @@ : C.getSingleOffloadToolChain()); CompileHostOnly = C.getDriver().offloadHostOnly(); - CompileDeviceOnly = C.getDriver().offloadDeviceOnly(); EmitLLVM = Args.getLastArg(options::OPT_emit_llvm); EmitAsm = Args.getLastArg(options::OPT_S); FixedCUID = Args.getLastArgValue(options::OPT_cuid_EQ); @@ -3352,16 +3353,40 @@ // only compilation. Bundle other type of output files only if // --gpu-bundle-output is specified for device only compilation. std::optional BundleOutput; + std::optional EmitReloc; public: HIPActionBuilder(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs) : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) { + DefaultCudaArch = CudaArch::GFX906; + + if (Args.hasArg(options::OPT_fhip_emit_relocatable, + options::OPT_fno_hip_emit_relocatable)) { + EmitReloc = Args.hasFlag(options::OPT_fhip_emit_relocatable, + options::OPT_fno_hip_emit_relocatable, false); + + if (*EmitReloc) { + if (Relocatable) { + C.getDriver().Diag(diag::err_opt_not_valid_with_opt) + << "-fhip-emit-relocatable" + << "-fgpu-rdc"; + } + + if (!CompileDeviceOnly) { + C.getDriver().Diag(diag::err_opt_not_valid_without_opt) + << "-fhip-emit-relocatable" + << "--cuda-device-only"; + } + } + } + if (Args.hasArg(options::OPT_gpu_bundle_output, options::OPT_no_gpu_bundle_output)) BundleOutput = Args.hasFlag(options::OPT_gpu_bundle_output, - options::OPT_no_gpu_bundle_output, true); + options::OPT_no_gpu_bundle_output, true) && + (!EmitReloc || !*EmitReloc); } bool canUseBundlerUnbundler() const override { return true; } @@ -3408,8 +3433,10 @@ assert(!CompileHostOnly && "Not expecting HIP actions in host-only compilation."); + bool ShouldLink = !EmitReloc || !*EmitReloc; + if (!Relocatable && CurPhase == phases::Backend && !EmitLLVM && - !EmitAsm) { + !EmitAsm && ShouldLink) { // If we are in backend phase, we attempt to generate the fat binary. // We compile each arch to IR and use a link action to generate code // object containing ISA. Then we use a special "link" action to create @@ -3485,6 +3512,8 @@ return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success; } else if (CurPhase == phases::Link) { + if (!ShouldLink) + return ABRT_Success; // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch. // This happens to each device action originated from each input file. // Later on, device actions in DeviceLinkerInputs are used to create @@ -3522,8 +3551,11 @@ CudaDeviceActions.clear(); } - return (CompileDeviceOnly && CurPhase == FinalPhase) ? ABRT_Ignore_Host - : ABRT_Success; + return (CompileDeviceOnly && + (CurPhase == FinalPhase || + (!ShouldLink && CurPhase == phases::Assemble))) + ? ABRT_Ignore_Host + : ABRT_Success; } void appendLinkDeviceActions(ActionList &AL) override { @@ -3674,7 +3706,6 @@ ++InactiveBuilders; continue; } - auto RetCode = SB->getDeviceDependences(DDeps, CurPhase, FinalPhase, Phases); diff --git a/clang/test/Driver/hip-dependent-options.hip b/clang/test/Driver/hip-dependent-options.hip new file mode 100644 --- /dev/null +++ b/clang/test/Driver/hip-dependent-options.hip @@ -0,0 +1,17 @@ +// RUN: %clang -### --target=x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ +// RUN: -c -fhip-emit-relocatable -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \ +// RUN: 2>&1 | FileCheck -check-prefixes=RELOCRDC %s + +// RELOCRDC: error: option '-fhip-emit-relocatable' cannot be specified with '-fgpu-rdc' + +// RUN: %clang -### --target=x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ +// RUN: -c -fhip-emit-relocatable -nogpuinc -nogpulib \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \ +// RUN: 2>&1 | FileCheck -check-prefixes=RELOCHOST %s + +// RELOCHOST: error: option '-fhip-emit-relocatable' cannot be specified without '--cuda-device-only' diff --git a/clang/test/Driver/hip-device-compile.hip b/clang/test/Driver/hip-device-compile.hip --- a/clang/test/Driver/hip-device-compile.hip +++ b/clang/test/Driver/hip-device-compile.hip @@ -45,6 +45,14 @@ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ // RUN: 2>&1 | FileCheck -check-prefixes=CHECK,ASM,NBUN %s +// Output relocatable. +// RUN: %clang -c --cuda-device-only -### --target=x86_64-linux-gnu \ +// RUN: -o a.o -x hip --cuda-gpu-arch=gfx900 -fhip-emit-relocatable \ +// RUN: --hip-device-lib=lib1.bc \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,NBUN,RELOC %s + // Output bundled assembly. // RUN: %clang -c -S --cuda-device-only -### --target=x86_64-linux-gnu \ // RUN: -o a.s -x hip --cuda-gpu-arch=gfx900 --no-gpu-bundle-output \ @@ -68,6 +76,7 @@ // LLBUN-SAME: "-o" "{{.*}}.ll" // ASM-SAME: "-o" "a.s" // ASMBUN-SAME: "-o" "{{.*}}.s" +// RELOC-SAME: "-o" "a.o" // CHECK-SAME: {{".*a.cu"}} // CHECK-NOT: {{"*.llvm-link"}} diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip --- a/clang/test/Driver/hip-phases.hip +++ b/clang/test/Driver/hip-phases.hip @@ -244,6 +244,43 @@ // DASM-NOT: clang-offload-bundler // DASM-NOT: host +// +// Test single gpu architecture with compile to relocatable in device-only +// compilation mode. +// +// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only -fhip-emit-relocatable 2>&1 \ +// RUN: | FileCheck -check-prefixes=RELOC %s +// RELOC-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// RELOC-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// RELOC-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// RELOC-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]]) +// RELOC-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]]) +// RELOC-NOT: linker +// RELOC-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P4]]}, object + +// +// Test two gpu architectures with compile to relocatable in device-only +// compilation mode. +// +// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only -fhip-emit-relocatable 2>&1 \ +// RUN: | FileCheck -check-prefixes=RELOC2 %s +// RELOC2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// RELOC2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// RELOC2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// RELOC2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]]) +// RELOC2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]]) +// RELOC2-NOT: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (device-[[T]], [[ARCH]]) +// RELOC2-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P4]]}, object +// RELOC2-DAG: [[P6:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH2:gfx900]]) +// RELOC2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]]) +// RELOC2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-[[T]], [[ARCH2]]) +// RELOC2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-[[T]], [[ARCH2]]) +// RELOC2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-[[T]], [[ARCH2]]) +// RELOC2-NOT: linker +// RELOC2-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P10]]}, object + // // Test two gpu architectures with complete compilation in device-only // compilation mode. diff --git a/clang/test/Driver/hip-rdc-device-only.hip b/clang/test/Driver/hip-rdc-device-only.hip --- a/clang/test/Driver/hip-rdc-device-only.hip +++ b/clang/test/Driver/hip-rdc-device-only.hip @@ -18,6 +18,16 @@ // RUN: %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \ // RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITBC %s +// With `-fno-hip-emit-relocatable`, the output should be the same as the aforementioned line +// as `-fgpu-rdc` in HIP implies `-fno-hip-emit-relocatable`. + +// RUN: %clang -### --target=x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ +// RUN: -c -fno-hip-emit-relocatable -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \ +// RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITBC %s + // RUN: %clang -### --target=x86_64-linux-gnu \ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ // RUN: -S -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \