Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -988,6 +988,10 @@ def fgpu_sanitize : Flag<["-"], "fgpu-sanitize">, Group, HelpText<"Enable sanitizer for AMDGPU target">; def fno_gpu_sanitize : Flag<["-"], "fno-gpu-sanitize">, Group; +def gpu_bundle_output : Flag<["--"], "gpu-bundle-output">, + Group, HelpText<"Bundle output files of HIP device compilation">; +def no_gpu_bundle_output : Flag<["--"], "no-gpu-bundle-output">, + Group, HelpText<"Do not bundle output files of HIP device compilation">; def cuid_EQ : Joined<["-"], "cuid=">, Flags<[CC1Option]>, HelpText<"An ID for compilation unit, which should be the same for the same " "compilation unit but different for different compilation units. " Index: clang/lib/Driver/Driver.cpp =================================================================== --- clang/lib/Driver/Driver.cpp +++ clang/lib/Driver/Driver.cpp @@ -2900,6 +2900,7 @@ /// The linker inputs obtained for each device arch. SmallVector DeviceLinkerInputs; bool GPUSanitize; + Optional BundleOutput; public: HIPActionBuilder(Compilation &C, DerivedArgList &Args, @@ -2908,6 +2909,12 @@ DefaultCudaArch = CudaArch::GFX803; GPUSanitize = Args.hasFlag(options::OPT_fgpu_sanitize, options::OPT_fno_gpu_sanitize, false); + // The default bundling behavior depends on the type of output, therefore + // BundleOutput needs to be tri-value: None, true, or false. + if (Args.hasArg(options::OPT_gpu_bundle_output, + options::OPT_no_gpu_bundle_output)) + BundleOutput = Args.hasFlag(options::OPT_gpu_bundle_output, + options::OPT_no_gpu_bundle_output); } bool canUseBundlerUnbundler() const override { return true; } @@ -2997,22 +3004,27 @@ CudaDeviceActions[I] = C.MakeAction( DDep, CudaDeviceActions[I]->getType()); } - // Create HIP fat binary with a special "link" action. 
- CudaFatBinary = - C.MakeAction(CudaDeviceActions, - types::TY_HIP_FATBIN); - if (!CompileDeviceOnly) { - DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr, - AssociatedOffloadKind); - // Clear the fat binary, it is already a dependence to an host - // action. - CudaFatBinary = nullptr; - } + // Bundle code objects unless --no-gpu-bundle-output is specified for + // device only compilation. + if (!CompileDeviceOnly || !BundleOutput.hasValue() || + BundleOutput.getValue()) { + // Create HIP fat binary with a special "link" action. + CudaFatBinary = C.MakeAction(CudaDeviceActions, + types::TY_HIP_FATBIN); - // Remove the CUDA actions as they are already connected to an host - // action or fat binary. - CudaDeviceActions.clear(); + if (!CompileDeviceOnly) { + DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr, + AssociatedOffloadKind); + // Clear the fat binary, it is already a dependence to an host + // action. + CudaFatBinary = nullptr; + } + + // Remove the CUDA actions as they are already connected to an host + // action or fat binary. + CudaDeviceActions.clear(); + } return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success; } else if (CurPhase == phases::Link) { @@ -3039,6 +3051,22 @@ A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A, AssociatedOffloadKind); + // Bundle output files if --gpu-bundle-output is set for device only + // compilation. + if (CompileDeviceOnly && CurPhase == FinalPhase && + BundleOutput.hasValue() && BundleOutput.getValue()) { + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + OffloadAction::DeviceDependences DDep; + DDep.add(*CudaDeviceActions[I], *ToolChains.front(), GpuArchList[I], + AssociatedOffloadKind); + CudaDeviceActions[I] = C.MakeAction( + DDep, CudaDeviceActions[I]->getType()); + } + CudaFatBinary = + C.MakeAction(CudaDeviceActions); + CudaDeviceActions.clear(); + } + return (CompileDeviceOnly && CurPhase == FinalPhase) ? 
ABRT_Ignore_Host : ABRT_Success; } Index: clang/test/Driver/clang-offload-bundler.c =================================================================== --- clang/test/Driver/clang-offload-bundler.c +++ clang/test/Driver/clang-offload-bundler.c @@ -361,6 +361,21 @@ // CKLST2-NOT: openmp-powerpc64le-ibm-linux-gnu // CKLST2-NOT: openmp-x86_64-pc-linux-gnu +// +// Check bundling without host target is allowed for HIP. +// +// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 \ +// RUN: -inputs=%t.tgt1,%t.tgt2 -outputs=%t.hip.bundle.bc +// RUN: clang-offload-bundler -type=bc -list -inputs=%t.hip.bundle.bc | FileCheck -check-prefix=NOHOST %s +// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 \ +// RUN: -outputs=%t.res.tgt1,%t.res.tgt2 -inputs=%t.hip.bundle.bc -unbundle +// RUN: diff %t.tgt1 %t.res.tgt1 +// RUN: diff %t.tgt2 %t.res.tgt2 +// +// NOHOST-NOT: host- +// NOHOST-DAG: hip-amdgcn-amd-amdhsa-gfx900 +// NOHOST-DAG: hip-amdgcn-amd-amdhsa-gfx906 + // Some code so that we can create a binary out of this file. int A = 0; void test_func(void) { Index: clang/test/Driver/hip-device-compile.hip =================================================================== --- clang/test/Driver/hip-device-compile.hip +++ clang/test/Driver/hip-device-compile.hip @@ -3,28 +3,56 @@ // REQUIRES: amdgpu-registered-target // If -emit-llvm and/or -S is used in device only compilation, -// the output should not be bundled. +// the output should not be bundled, except --gpu-bundle-output +// is specified. +// Output unbundled bitcode. 
// RUN: %clang -c -emit-llvm --cuda-device-only -### -target x86_64-linux-gnu \ -// RUN: -o a.bc -x hip --cuda-gpu-arch=gfx900 \ +// RUN: -o a.bc -x hip --cuda-gpu-arch=gfx900 --no-gpu-bundle-output \ // RUN: --hip-device-lib=lib1.bc \ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ -// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,BC %s +// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,BC,NBUN %s +// Output bundled bitcode. +// RUN: %clang -c -emit-llvm --cuda-device-only -### -target x86_64-linux-gnu \ +// RUN: -o a.bc -x hip --cuda-gpu-arch=gfx900 --no-gpu-bundle-output \ +// RUN: --hip-device-lib=lib1.bc \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu --gpu-bundle-output \ +// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,BCBUN %s + +// Output unbundled LLVM IR. // RUN: %clang -c -S -emit-llvm --cuda-device-only -### -target x86_64-linux-gnu \ -// RUN: -o a.ll -x hip --cuda-gpu-arch=gfx900 \ +// RUN: -o a.ll -x hip --cuda-gpu-arch=gfx900 --no-gpu-bundle-output \ // RUN: --hip-device-lib=lib1.bc \ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ -// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LL %s +// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LL,NBUN %s + +// Output bundled LLVM IR. +// RUN: %clang -c -S -emit-llvm --cuda-device-only -### -target x86_64-linux-gnu \ +// RUN: -o a.ll -x hip --cuda-gpu-arch=gfx900 --no-gpu-bundle-output \ +// RUN: --hip-device-lib=lib1.bc \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu --gpu-bundle-output \ +// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LLBUN %s +// Output unbundled assembly. 
// RUN: %clang -c -S --cuda-device-only -### -target x86_64-linux-gnu \ -// RUN: -o a.s -x hip --cuda-gpu-arch=gfx900 \ +// RUN: -o a.s -x hip --cuda-gpu-arch=gfx900 --no-gpu-bundle-output \ // RUN: --hip-device-lib=lib1.bc \ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ -// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,ASM %s +// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,ASM,NBUN %s + +// Output bundled assembly. +// RUN: %clang -c -S --cuda-device-only -### -target x86_64-linux-gnu \ +// RUN: -o a.s -x hip --cuda-gpu-arch=gfx900 --no-gpu-bundle-output \ +// RUN: --hip-device-lib=lib1.bc \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu --gpu-bundle-output \ +// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,ASMBUN %s // CHECK: {{".*clang.*"}} "-cc1" "-triple" "amdgcn-amd-amdhsa" // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" @@ -36,38 +64,64 @@ // CHECK-SAME: {{".*lib1.bc"}} // CHECK-SAME: "-target-cpu" "gfx900" // BC-SAME: "-o" "a.bc" +// BCBUN-SAME: "-o" "{{.*}}.bc" // LL-SAME: "-o" "a.ll" +// LLBUN-SAME: "-o" "{{.*}}.ll" // ASM-SAME: "-o" "a.s" +// ASMBUN-SAME: "-o" "{{.*}}.s" // CHECK-SAME: {{".*a.cu"}} // CHECK-NOT: {{"*.llvm-link"}} // CHECK-NOT: {{".*opt"}} // CHECK-NOT: {{".*llc"}} // CHECK-NOT: {{".*lld.*"}} -// CHECK-NOT: {{".*clang-offload-bundler"}} +// NBUN-NOT: {{".*clang-offload-bundler"}} +// BCBUN: {{".*clang-offload-bundler"}}{{.*}}"-outputs=a.bc" +// LLBUN: {{".*clang-offload-bundler"}}{{.*}}"-outputs=a.ll" +// ASMBUN: {{".*clang-offload-bundler"}}{{.*}}"-outputs=a.s" // CHECK-NOT: {{".*ld.*"}} // If neither -emit-llvm nor -S is used in device only compilation, -// the output should be bundled. +// the output should be bundled except --no-gpu-bundle-output is +// specified. +// Output bundled code objects. 
// RUN: %clang -c --cuda-device-only -### -target x86_64-linux-gnu \ -// RUN: -o a.s -x hip --cuda-gpu-arch=gfx900 \ +// RUN: -o a.o -x hip --cuda-gpu-arch=gfx900 \ // RUN: --hip-device-lib=lib1.bc \ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ -// RUN: 2>&1 | FileCheck -check-prefixes=BUNDLE %s +// RUN: 2>&1 | FileCheck -check-prefixes=OBJ,OBJ-BUN %s +// Output unbundled code objects. +// RUN: %clang -c --cuda-device-only -### -target x86_64-linux-gnu \ +// RUN: -o a.o -x hip --cuda-gpu-arch=gfx900 \ +// RUN: --hip-device-lib=lib1.bc \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu --no-gpu-bundle-output \ +// RUN: 2>&1 | FileCheck -check-prefixes=OBJ,OBJ-UBUN %s + +// Output bundled code objects. // RUN: %clang --cuda-device-only -### -target x86_64-linux-gnu \ -// RUN: -o a.s -x hip --cuda-gpu-arch=gfx900 \ +// RUN: -o a.o -x hip --cuda-gpu-arch=gfx900 \ // RUN: --hip-device-lib=lib1.bc \ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ -// RUN: 2>&1 | FileCheck -check-prefixes=BUNDLE %s +// RUN: 2>&1 | FileCheck -check-prefixes=OBJ,OBJ-BUN %s -// BUNDLE: {{"*.clang.*"}} {{.*}} "-emit-obj" -// BUNDLE-NOT: {{"*.llvm-link"}} -// BUNDLE-NOT: {{".*opt"}} -// BUNDLE-NOT: {{".*llc"}} -// BUNDLE: {{".*lld.*"}} -// BUNDLE: {{".*clang-offload-bundler"}} +// Output unbundled code objects. 
+// RUN: %clang --cuda-device-only -### -target x86_64-linux-gnu \ +// RUN: -o a.o -x hip --cuda-gpu-arch=gfx900 \ +// RUN: --hip-device-lib=lib1.bc \ +// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu --no-gpu-bundle-output \ +// RUN: 2>&1 | FileCheck -check-prefixes=OBJ,OBJ-UBUN %s +// OBJ: {{"*.clang.*"}} {{.*}} "-emit-obj" +// OBJ-NOT: {{"*.llvm-link"}} +// OBJ-NOT: {{".*opt"}} +// OBJ-NOT: {{".*llc"}} +// OBJ-BUN: {{".*lld.*"}}{{.*}}"-o" "{{.*}}.o" +// OBJ-UBUN: {{".*lld.*"}}{{.*}}"-o" "a.o" +// OBJ-BUN: {{".*clang-offload-bundler"}}{{.*}}"-outputs=a.o" +// OBJ-UBUN-NOT: {{".*clang-offload-bundler"}} Index: clang/test/Driver/hip-output-file-name.hip =================================================================== --- clang/test/Driver/hip-output-file-name.hip +++ clang/test/Driver/hip-output-file-name.hip @@ -2,6 +2,7 @@ // REQUIRES: x86-registered-target // REQUIRES: amdgpu-registered-target +// Output bundled code objects for combined compilation. // RUN: %clang -### -c -target x86_64-linux-gnu -fgpu-rdc \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ // RUN: 2>&1 | FileCheck %s @@ -9,43 +10,77 @@ // CHECK: {{.*}}clang-offload-bundler{{.*}}"-outputs=hip-output-file-name.o" // Check -E default output is "-" (stdout). +// If there are multiple preprocessor expansion outputs clang-offload-bundler +// is used to bundle the final output. +// Output bundled PPE for one GPU for mixed compilation. +// RUN: %clang -### -E -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx803 %s \ +// RUN: 2>&1 | FileCheck -check-prefixes=DASH %s + +// Output unbundled PPE for one GPU for device only compilation. +// RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx803 %s \ +// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-DASH %s + +// Output bundled PPE for two GPUs for mixed compilation. 
// RUN: %clang -### -E -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ // RUN: 2>&1 | FileCheck -check-prefixes=DASH %s +// Output bundled PPE for two GPUs for mixed compilation with -save-temps. // RUN: %clang -### -E -save-temps -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ // RUN: 2>&1 | FileCheck -check-prefixes=DASH %s +// Output unbundled PPE for two GPUs for device only compilation. // RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ // RUN: 2>&1 | FileCheck -check-prefixes=CLANG-DASH %s +// Output bundled PPE for two GPUs for device only compilation with --gpu-bundle-output. +// RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --gpu-bundle-output \ +// RUN: 2>&1 | FileCheck -check-prefixes=DASH %s + +// Output unbundled PPE for two GPUs for device only compilation with --no-gpu-bundle-output. +// RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --no-gpu-bundle-output \ +// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-DASH %s + +// Output unbundled PPE for host only compilation. // RUN: %clang -### -E --cuda-host-only -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ // RUN: 2>&1 | FileCheck -check-prefixes=CLANG-DASH %s +// DASH-NOT: {{.*}}clang{{.*}}"-o" "-" // DASH: {{.*}}clang-offload-bundler{{.*}}"-outputs=-" // CLANG-DASH: {{.*}}clang{{.*}}"-o" "-" +// CLANG-DASH-NOT: {{.*}}clang-offload-bundler{{.*}}"-outputs=-" // Check -E with -o. +// Output bundled PPE for two GPUs for mixed compilation. // RUN: %clang -### -E -o test.cui -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ // RUN: 2>&1 | FileCheck -check-prefixes=OUT %s +// Output bundled PPE for two GPUs for mixed compilation. 
// RUN: %clang -### -E -o test.cui -save-temps -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ // RUN: 2>&1 | FileCheck -check-prefixes=OUT %s +// Output bundled PPE for two GPUs for device only compilation with --gpu-bundle-output. // RUN: %clang -### -E -o test.cui --cuda-device-only -target x86_64-linux-gnu \ -// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ -// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-OUT %s +// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 --gpu-bundle-output %s \ +// RUN: 2>&1 | FileCheck -check-prefixes=OUT %s +// Output unbundled PPE for host only compilation. // RUN: %clang -### -E -o test.cui --cuda-host-only -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ // RUN: 2>&1 | FileCheck -check-prefixes=CLANG-OUT %s +// OUT-NOT: {{.*}}clang{{.*}}"-o" "test.cui" // OUT: {{.*}}clang-offload-bundler{{.*}}"-outputs=test.cui" // CLANG-OUT: {{.*}}clang{{.*}}"-o" "test.cui" +// CLANG-OUT-NOT: {{.*}}clang-offload-bundler{{.*}}"-outputs=test.cui" Index: clang/test/Driver/hip-phases.hip =================================================================== --- clang/test/Driver/hip-phases.hip +++ clang/test/Driver/hip-phases.hip @@ -231,13 +231,14 @@ // compilation mode. 
// // RUN: %clang -x hip -target x86_64-unknown-linux-gnu -ccc-print-phases \ -// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only -S 2>&1 \ +// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only -S --no-gpu-bundle-output 2>&1 \ // RUN: | FileCheck -check-prefixes=DASM %s // DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) // DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) // DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) // DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]]) // DASM-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, assembler +// DASM-NOT: clang-offload-bundler // DASM-NOT: host // @@ -270,8 +271,20 @@ // // RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ // RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ +// RUN: --cuda-device-only -S -o %t.s 2>&1 \ +// RUN: | FileCheck -check-prefixes=DASM2,DASM2-NOBUNDLE %s +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ +// RUN: --cuda-device-only -S -o %t.s --no-gpu-bundle-output 2>&1 \ +// RUN: | FileCheck -check-prefixes=DASM2,DASM2-NOBUNDLE %s +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ // RUN: --cuda-device-only -S 2>&1 \ -// RUN: | FileCheck -check-prefixes=DASM2 %s +// RUN: | FileCheck -check-prefixes=DASM2,DASM2-NOBUNDLE %s +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ +// RUN: --cuda-device-only -S --gpu-bundle-output 2>&1 \ +// RUN: | FileCheck -check-prefixes=DASM2,DASM2-BUNDLE %s // DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) // DASM2-DAG: [[P1:[0-9]+]]: preprocessor, 
{[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) // DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) @@ -282,6 +295,8 @@ // DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]]) // DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]]) // DASM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, assembler +// DASM2-BUNDLE: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, assembler, (device-hip, ) +// DASM2-NOBUNDLE-NOT: clang-offload-bundler, {[[P4]], [[P9]]}, assembler, (device-hip, ) // DASM2-NOT: host // @@ -312,3 +327,117 @@ // NL2-DAG: [[P4:[0-9]+]]: linker, {[[P0]], [[P2]]}, image, (host-[[T]]) // RL2-DAG: [[P4:[0-9]+]]: linker, {[[P1]], [[P3]], [[P9]]}, image, (host-[[T]]) + +// Test one gpu architectures up to the preprocessor expansion output phase in device-only +// compilation mode. no bundle. +// +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \ +// RUN: --cuda-device-only -E 2>&1 \ +// RUN: | FileCheck -check-prefixes=PPE,PPEN %s + +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \ +// RUN: --cuda-device-only -E --no-gpu-bundle-output 2>&1 \ +// RUN: | FileCheck -check-prefixes=PPE,PPEN %s + +// Test one gpu architectures up to the preprocessor expansion output phase in device-only +// compilation mode. bundle. + +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \ +// RUN: --cuda-device-only -E --gpu-bundle-output 2>&1 \ +// RUN: | FileCheck -check-prefixes=PPE,PPEB %s + +// Test two gpu architectures up to the preprocessor expansion output phase in device-only +// compilation mode. no bundle. 
+ +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ +// RUN: --cuda-device-only -E 2>&1 \ +// RUN: | FileCheck -check-prefixes=PPE2,PPE2N %s + +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ +// RUN: --cuda-device-only -E --no-gpu-bundle-output 2>&1 \ +// RUN: | FileCheck -check-prefixes=PPE2,PPE2N %s + +// Test two gpu architectures up to the preprocessor expansion output phase in device-only +// compilation mode. bundle. + +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ +// RUN: --cuda-device-only -E --gpu-bundle-output 2>&1 \ +// RUN: | FileCheck -check-prefixes=PPE2,PPE2B %s + +// Test one gpu architectures up to the LLVM IR output phase in device-only +// compilation mode. no bundle. +// +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \ +// RUN: --cuda-device-only -c -emit-llvm 2>&1 \ +// RUN: | FileCheck -check-prefixes=LLVM %s + +// Test two gpu architectures up to the LLVM IR output phase in device-only +// compilation mode. bundle. +// +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ +// RUN: --cuda-device-only -c -emit-llvm -o %t.bc --gpu-bundle-output 2>&1 \ +// RUN: | FileCheck -check-prefixes=LLVM2 %s + +// Test two gpu architectures up to the LLVM IR output phase in device-only +// compilation mode with bundled preprocessor expansion as input. bundle. 
+// +// RUN: %clang -x hip-cpp-output -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ +// RUN: --cuda-device-only -c -emit-llvm -o %t.bc --gpu-bundle-output 2>&1 \ +// RUN: | FileCheck -check-prefixes=PPELLVM2 %s + +// PPE-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// PPE-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// PPE-DAG: [[P2:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P1]]}, [[T]]-cpp-output +// PPEB-DAG: [[P3:[0-9]+]]: clang-offload-bundler, {[[P2]]}, [[T]]-cpp-output, (device-hip, ) +// PPEN-NOT: clang-offload-bundler, {{.*}}, [[T]]-cpp-output, (device-hip, ) +// PPE-NOT: host + +// PPE2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// PPE2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// PPE2-DAG: [[P2:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P1]]}, [[T]]-cpp-output +// PPE2-DAG: [[P5:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH2:gfx900]]) +// PPE2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]]) +// PPE2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P6]]}, [[T]]-cpp-output +// PPE2B-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P2]], [[P9]]}, [[T]]-cpp-output, (device-hip, ) +// PPE2N-NOT: clang-offload-bundler, {{.*}}, [[T]]-cpp-output, (device-hip, ) +// PPE2-NOT: host + +// LLVM-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// LLVM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// LLVM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// LLVM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]]) +// LLVM-NOT: clang-offload-bundler +// 
LLVM-NOT: host + +// LLVM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// LLVM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// LLVM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// LLVM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]]) +// LLVM2-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, ir +// LLVM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH2:gfx900]]) +// LLVM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]]) +// LLVM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]]) +// LLVM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, ir, (device-[[T]], [[ARCH2]]) +// LLVM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, ir +// LLVM2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, ir, (device-hip, ) +// LLVM2-NOT: host + +// PPELLVM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]]-cpp-output +// PPELLVM2-DAG: [[P1:[0-9]+]]: clang-offload-unbundler, {[[P0]]}, hip-cpp-output +// PPELLVM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH:gfx803]]) +// PPELLVM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]]) +// PPELLVM2-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, ir +// PPELLVM2-DAG: [[P7:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH2:gfx900]]) +// PPELLVM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, ir, (device-[[T]], [[ARCH2]]) +// PPELLVM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, ir +// PPELLVM2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, ir, (device-hip, ) +// PPELLVM2-NOT: host Index: clang/test/Driver/hip-rdc-device-only.hip =================================================================== --- 
clang/test/Driver/hip-rdc-device-only.hip +++ clang/test/Driver/hip-rdc-device-only.hip @@ -6,7 +6,7 @@ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ // RUN: -c -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ -// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \ // RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITBC %s // With `-emit-llvm`, the output should be the same as the aforementioned line @@ -16,14 +16,14 @@ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ // RUN: -c -emit-llvm -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ -// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \ // RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITBC %s // RUN: %clang -### -target x86_64-linux-gnu \ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ // RUN: -S -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ -// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \ // RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITLL %s // With `-emit-llvm`, the output should be the same as the aforementioned line @@ -33,7 +33,7 @@ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ // RUN: -S -emit-llvm -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ -// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \ // RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITLL %s // With `-save-temps`, commane lines for each steps are dumped. 
For assembly @@ -44,9 +44,17 @@ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ // RUN: -S -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ -// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \ // RUN: 2>&1 | FileCheck -check-prefix=SAVETEMP %s +// Check output one file without bundling cause error. + +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ +// RUN: -S -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu -o %t.s --no-gpu-bundle-output \ +// RUN: 2>&1 | FileCheck -check-prefix=FAIL %s + // COMMON: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" // COMMON-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" // EMITBC-SAME: "-emit-llvm-bc" @@ -56,8 +64,8 @@ // COMMON-SAME: "-fapply-global-visibility-to-externs" // COMMON-SAME: "-target-cpu" "gfx803" // COMMON-SAME: "-fgpu-rdc" -// EMITBC-SAME: {{.*}} "-o" {{"a.*bc"}} "-x" "hip" -// EMITLL-SAME: {{.*}} "-o" {{"a.*ll"}} "-x" "hip" +// EMITBC-SAME: {{.*}} "-o" {{".*a.*bc"}} "-x" "hip" +// EMITLL-SAME: {{.*}} "-o" {{".*a.*ll"}} "-x" "hip" // CHECK-SAME: {{.*}} {{".*a.cu"}} // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" @@ -69,10 +77,14 @@ // COMMON-SAME: "-fapply-global-visibility-to-externs" // COMMON-SAME: "-target-cpu" "gfx900" // COMMON-SAME: "-fgpu-rdc" -// EMITBC-SAME: {{.*}} "-o" {{"a.*bc"}} "-x" "hip" -// EMITLL-SAME: {{.*}} "-o" {{"a.*ll"}} "-x" "hip" +// EMITBC-SAME: {{.*}} "-o" {{".*a.*bc"}} "-x" "hip" +// EMITLL-SAME: {{.*}} "-o" {{".*a.*ll"}} "-x" "hip" // COMMON-SAME: {{.*}} {{".*a.cu"}} +// COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}" +// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// COMMON-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}" + // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" // 
COMMON-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" // EMITBC-SAME: "-emit-llvm-bc" @@ -82,8 +94,8 @@ // COMMON-SAME: "-fapply-global-visibility-to-externs" // COMMON-SAME: "-target-cpu" "gfx803" // COMMON-SAME: "-fgpu-rdc" -// EMITBC-SAME: {{.*}} "-o" {{"b.*bc"}} "-x" "hip" -// EMITLL-SAME: {{.*}} "-o" {{"b.*ll"}} "-x" "hip" +// EMITBC-SAME: {{.*}} "-o" {{".*b.*bc"}} "-x" "hip" +// EMITLL-SAME: {{.*}} "-o" {{".*b.*ll"}} "-x" "hip" // COMMON-SAME: {{.*}} {{".*b.hip"}} // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" @@ -95,10 +107,14 @@ // COMMON-SAME: "-fapply-global-visibility-to-externs" // COMMON-SAME: "-target-cpu" "gfx900" // COMMON-SAME: "-fgpu-rdc" -// EMITBC-SAME: {{.*}} "-o" {{"b.*bc"}} "-x" "hip" -// EMITLL-SAME: {{.*}} "-o" {{"b.*ll"}} "-x" "hip" +// EMITBC-SAME: {{.*}} "-o" {{".*b.*bc"}} "-x" "hip" +// EMITLL-SAME: {{.*}} "-o" {{".*b.*ll"}} "-x" "hip" // COMMON-SAME: {{.*}} {{".*b.hip"}} +// COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}" +// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// COMMON-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}" + // SAVETEMP: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu" // SAVETEMP-SAME: "-E" // SAVETEMP-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803" @@ -125,6 +141,10 @@ // SAVETEMP-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900" // SAVETEMP-SAME: {{.*}} "-o" {{"a.*.ll"}} "-x" "ir" [[A_GFX900_TMP_BC]] +// SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll" +// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// SAVETEMP-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.ll" + // SAVETEMP: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu" // SAVETEMP-SAME: "-E" // SAVETEMP-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803" @@ -150,3 +170,9 @@ // SAVETEMP-SAME: "-emit-llvm" // 
SAVETEMP-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900" // SAVETEMP-SAME: {{.*}} "-o" {{"b.*.ll"}} "-x" "ir" [[B_GFX900_TMP_BC]] + +// SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll" +// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// SAVETEMP-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.ll" + +// FAIL: error: cannot specify -o when generating multiple output files Index: clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp =================================================================== --- clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp +++ clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp @@ -117,6 +117,9 @@ /// The index of the host input in the list of inputs. static unsigned HostInputIndex = ~0u; +/// Whether not having host target is allowed. +static bool AllowNoHost = false; + /// Path to the current binary. static std::string BundlerExecutable; @@ -839,9 +842,10 @@ } // Get the file handler. We use the host buffer as reference. - assert(HostInputIndex != ~0u && "Host input index undefined??"); + assert((HostInputIndex != ~0u || AllowNoHost) && + "Host input index undefined??"); Expected> FileHandlerOrErr = - CreateFileHandler(*InputBuffers[HostInputIndex]); + CreateFileHandler(*InputBuffers[AllowNoHost ? 0 : HostInputIndex]); if (!FileHandlerOrErr) return FileHandlerOrErr.takeError(); @@ -1108,6 +1112,7 @@ // have exactly one host target. unsigned Index = 0u; unsigned HostTargetNum = 0u; + bool HIPOnly = true; llvm::DenseSet ParsedTargets; for (StringRef Target : TargetNames) { if (ParsedTargets.contains(Target)) { @@ -1149,12 +1154,21 @@ HostInputIndex = Index; } + if (Kind != "hip" && Kind != "hipv4") + HIPOnly = false; + ++Index; } + // HIP uses clang-offload-bundler to bundle device-only compilation results + // for multiple GPU archs, therefore allow no host target if all entries + // are for HIP. 
+ AllowNoHost = HIPOnly; + // Host triple is not really needed for unbundling operation, so do not // treat missing host triple as error if we do unbundling. - if ((Unbundle && HostTargetNum > 1) || (!Unbundle && HostTargetNum != 1)) { + if ((Unbundle && HostTargetNum > 1) || + (!Unbundle && HostTargetNum != 1 && !AllowNoHost)) { reportError(createStringError(errc::invalid_argument, "expecting exactly one host target but got " + Twine(HostTargetNum)));