Index: clang/lib/Driver/Driver.cpp =================================================================== --- clang/lib/Driver/Driver.cpp +++ clang/lib/Driver/Driver.cpp @@ -3006,6 +3006,22 @@ A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A, AssociatedOffloadKind); + // Bundle output files for device only compilation if there is more + // than one output file. + if (CompileDeviceOnly && CurPhase == FinalPhase && + GpuArchList.size() > 1) { + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + OffloadAction::DeviceDependences DDep; + DDep.add(*CudaDeviceActions[I], *ToolChains.front(), GpuArchList[I], + AssociatedOffloadKind); + CudaDeviceActions[I] = C.MakeAction<OffloadAction>( + DDep, CudaDeviceActions[I]->getType()); + } + CudaFatBinary = + C.MakeAction<OffloadBundlingJobAction>(CudaDeviceActions); + CudaDeviceActions.clear(); + } + return (CompileDeviceOnly && CurPhase == FinalPhase) ? ABRT_Ignore_Host : ABRT_Success; } Index: clang/test/Driver/clang-offload-bundler.c =================================================================== --- clang/test/Driver/clang-offload-bundler.c +++ clang/test/Driver/clang-offload-bundler.c @@ -362,6 +362,21 @@ // CKLST2-NOT: openmp-powerpc64le-ibm-linux-gnu // CKLST2-NOT: openmp-x86_64-pc-linux-gnu +// +// Check bundling without host target is allowed for HIP.
+// +// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 \ +// RUN: -inputs=%t.tgt1,%t.tgt2 -outputs=%t.hip.bundle.bc +// RUN: clang-offload-bundler -type=bc -list -inputs=%t.hip.bundle.bc | FileCheck -check-prefix=NOHOST %s +// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 \ +// RUN: -outputs=%t.res.tgt1,%t.res.tgt2 -inputs=%t.hip.bundle.bc -unbundle +// RUN: diff %t.tgt1 %t.res.tgt1 +// RUN: diff %t.tgt2 %t.res.tgt2 +// +// NOHOST-NOT: host- +// NOHOST-DAG: hip-amdgcn-amd-amdhsa-gfx900 +// NOHOST-DAG: hip-amdgcn-amd-amdhsa-gfx906 + // Some code so that we can create a binary out of this file. int A = 0; void test_func(void) { Index: clang/test/Driver/hip-output-file-name.hip =================================================================== --- clang/test/Driver/hip-output-file-name.hip +++ clang/test/Driver/hip-output-file-name.hip @@ -9,6 +9,16 @@ // CHECK: {{.*}}clang-offload-bundler{{.*}}"-outputs=hip-output-file-name.o" // Check -E default output is "-" (stdout). +// If there are multiple preprocessor expansion outputs clang-offload-bundler +// is used to bundle the final output. 
+ +// RUN: %clang -### -E -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx803 %s \ +// RUN: 2>&1 | FileCheck -check-prefixes=DASH %s + +// RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx803 %s \ +// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-DASH %s // RUN: %clang -### -E -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ @@ -20,7 +30,7 @@ // RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ -// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-DASH %s +// RUN: 2>&1 | FileCheck -check-prefixes=DASH %s // RUN: %clang -### -E --cuda-host-only -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ @@ -41,7 +51,7 @@ // RUN: %clang -### -E -o test.cui --cuda-device-only -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ -// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-OUT %s +// RUN: 2>&1 | FileCheck -check-prefixes=OUT %s // RUN: %clang -### -E -o test.cui --cuda-host-only -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ Index: clang/test/Driver/hip-phases.hip =================================================================== --- clang/test/Driver/hip-phases.hip +++ clang/test/Driver/hip-phases.hip @@ -238,6 +238,7 @@ // DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) // DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]]) // DASM-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, assembler +// DASM-NOT: clang-offload-bundler // DASM-NOT: host // @@ -282,6 +283,7 @@ // DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]]) // DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]]) // DASM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, assembler 
+// DASM2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, assembler, (device-hip, ) // DASM2-NOT: host // @@ -312,3 +314,88 @@ // NL2-DAG: [[P4:[0-9]+]]: linker, {[[P0]], [[P2]]}, image, (host-[[T]]) // RL2-DAG: [[P4:[0-9]+]]: linker, {[[P1]], [[P3]], [[P9]]}, image, (host-[[T]]) + +// Test one gpu architecture up to the preprocessor expansion output phase in device-only +// compilation mode. +// +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \ +// RUN: --cuda-device-only -E 2>&1 \ +// RUN: | FileCheck -check-prefixes=PPE %s + +// Test two gpu architectures up to the preprocessor expansion output phase in device-only +// compilation mode. +// +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ +// RUN: --cuda-device-only -E 2>&1 \ +// RUN: | FileCheck -check-prefixes=PPE2 %s + +// Test one gpu architecture up to the LLVM IR output phase in device-only +// compilation mode. +// +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \ +// RUN: --cuda-device-only -c -emit-llvm 2>&1 \ +// RUN: | FileCheck -check-prefixes=LLVM %s + +// Test two gpu architectures up to the LLVM IR output phase in device-only +// compilation mode. +// +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ +// RUN: --cuda-device-only -c -emit-llvm 2>&1 \ +// RUN: | FileCheck -check-prefixes=LLVM2 %s + +// Test two gpu architectures up to the LLVM IR output phase in device-only +// compilation mode with bundled preprocessor expansion as input.
+// +// RUN: %clang -x hip-cpp-output -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ +// RUN: --cuda-device-only -c -emit-llvm 2>&1 \ +// RUN: | FileCheck -check-prefixes=PPELLVM2 %s + +// PPE-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// PPE-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// PPE-NOT: clang-offload-bundler +// PPE-NOT: host + +// PPE2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// PPE2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// PPE2-DAG: [[P2:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P1]]}, [[T]]-cpp-output +// PPE2-DAG: [[P5:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH2:gfx900]]) +// PPE2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]]) +// PPE2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P6]]}, [[T]]-cpp-output +// PPE2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P2]], [[P9]]}, [[T]]-cpp-output, (device-hip, ) +// PPE2-NOT: host + +// LLVM-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// LLVM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// LLVM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// LLVM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]]) +// LLVM-NOT: clang-offload-bundler +// LLVM-NOT: host + +// LLVM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// LLVM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// LLVM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// LLVM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, 
(device-[[T]], [[ARCH]]) +// LLVM2-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, ir +// LLVM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH2:gfx900]]) +// LLVM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]]) +// LLVM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]]) +// LLVM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, ir, (device-[[T]], [[ARCH2]]) +// LLVM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, ir +// LLVM2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, ir, (device-hip, ) +// LLVM2-NOT: host + +// PPELLVM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]]-cpp-output +// PPELLVM2-DAG: [[P1:[0-9]+]]: clang-offload-unbundler, {[[P0]]}, hip-cpp-output +// PPELLVM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH:gfx803]]) +// PPELLVM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]]) +// PPELLVM2-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, ir +// PPELLVM2-DAG: [[P7:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH2:gfx900]]) +// PPELLVM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, ir, (device-[[T]], [[ARCH2]]) +// PPELLVM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, ir +// PPELLVM2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, ir, (device-hip, ) +// PPELLVM2-NOT: host Index: clang/test/Driver/hip-rdc-device-only.hip =================================================================== --- clang/test/Driver/hip-rdc-device-only.hip +++ clang/test/Driver/hip-rdc-device-only.hip @@ -56,8 +56,8 @@ // COMMON-SAME: "-fapply-global-visibility-to-externs" // COMMON-SAME: "-target-cpu" "gfx803" // COMMON-SAME: "-fgpu-rdc" -// EMITBC-SAME: {{.*}} "-o" {{"a.*bc"}} "-x" "hip" -// EMITLL-SAME: {{.*}} "-o" {{"a.*ll"}} "-x" "hip" +// EMITBC-SAME: {{.*}} "-o" 
{{".*a.*bc"}} "-x" "hip" +// EMITLL-SAME: {{.*}} "-o" {{".*a.*ll"}} "-x" "hip" // CHECK-SAME: {{.*}} {{".*a.cu"}} // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" @@ -69,10 +69,14 @@ // COMMON-SAME: "-fapply-global-visibility-to-externs" // COMMON-SAME: "-target-cpu" "gfx900" // COMMON-SAME: "-fgpu-rdc" -// EMITBC-SAME: {{.*}} "-o" {{"a.*bc"}} "-x" "hip" -// EMITLL-SAME: {{.*}} "-o" {{"a.*ll"}} "-x" "hip" +// EMITBC-SAME: {{.*}} "-o" {{".*a.*bc"}} "-x" "hip" +// EMITLL-SAME: {{.*}} "-o" {{".*a.*ll"}} "-x" "hip" // COMMON-SAME: {{.*}} {{".*a.cu"}} +// COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}" +// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// COMMON-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}" + // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" // COMMON-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" // EMITBC-SAME: "-emit-llvm-bc" @@ -82,8 +86,8 @@ // COMMON-SAME: "-fapply-global-visibility-to-externs" // COMMON-SAME: "-target-cpu" "gfx803" // COMMON-SAME: "-fgpu-rdc" -// EMITBC-SAME: {{.*}} "-o" {{"b.*bc"}} "-x" "hip" -// EMITLL-SAME: {{.*}} "-o" {{"b.*ll"}} "-x" "hip" +// EMITBC-SAME: {{.*}} "-o" {{".*b.*bc"}} "-x" "hip" +// EMITLL-SAME: {{.*}} "-o" {{".*b.*ll"}} "-x" "hip" // COMMON-SAME: {{.*}} {{".*b.hip"}} // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" @@ -95,10 +99,14 @@ // COMMON-SAME: "-fapply-global-visibility-to-externs" // COMMON-SAME: "-target-cpu" "gfx900" // COMMON-SAME: "-fgpu-rdc" -// EMITBC-SAME: {{.*}} "-o" {{"b.*bc"}} "-x" "hip" -// EMITLL-SAME: {{.*}} "-o" {{"b.*ll"}} "-x" "hip" +// EMITBC-SAME: {{.*}} "-o" {{".*b.*bc"}} "-x" "hip" +// EMITLL-SAME: {{.*}} "-o" {{".*b.*ll"}} "-x" "hip" // COMMON-SAME: {{.*}} {{".*b.hip"}} +// COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}" +// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// COMMON-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}" + // SAVETEMP: 
[[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu" // SAVETEMP-SAME: "-E" // SAVETEMP-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803" @@ -125,6 +133,10 @@ // SAVETEMP-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900" // SAVETEMP-SAME: {{.*}} "-o" {{"a.*.ll"}} "-x" "ir" [[A_GFX900_TMP_BC]] +// SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll" +// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// SAVETEMP-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.ll" + // SAVETEMP: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu" // SAVETEMP-SAME: "-E" // SAVETEMP-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803" @@ -150,3 +162,7 @@ // SAVETEMP-SAME: "-emit-llvm" // SAVETEMP-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900" // SAVETEMP-SAME: {{.*}} "-o" {{"b.*.ll"}} "-x" "ir" [[B_GFX900_TMP_BC]] + +// SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll" +// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// SAVETEMP-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.ll" Index: clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp =================================================================== --- clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp +++ clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp @@ -117,6 +117,9 @@ /// The index of the host input in the list of inputs. static unsigned HostInputIndex = ~0u; +/// Whether not having host target is allowed. +static bool AllowNoHost = false; + /// Path to the current binary. static std::string BundlerExecutable; @@ -857,9 +860,10 @@ } // Get the file handler. We use the host buffer as reference. 
- assert(HostInputIndex != ~0u && "Host input index undefined??"); + assert((HostInputIndex != ~0u || AllowNoHost) && + "Host input index undefined??"); Expected<std::unique_ptr<FileHandler>> FileHandlerOrErr = - CreateFileHandler(*InputBuffers[HostInputIndex]); + CreateFileHandler(*InputBuffers[AllowNoHost ? 0 : HostInputIndex]); if (!FileHandlerOrErr) return FileHandlerOrErr.takeError(); @@ -1126,6 +1130,7 @@ // have exactly one host target. unsigned Index = 0u; unsigned HostTargetNum = 0u; + bool HIPOnly = true; llvm::DenseSet<StringRef> ParsedTargets; for (StringRef Target : TargetNames) { if (ParsedTargets.contains(Target)) { @@ -1167,12 +1172,21 @@ HostInputIndex = Index; } + if (Kind != "hip" && Kind != "hipv4") + HIPOnly = false; + ++Index; } + // HIP uses clang-offload-bundler to bundle device-only compilation results + // for multiple GPU archs, therefore allow no host target if all entries + // are for HIP. + AllowNoHost = HIPOnly; + // Host triple is not really needed for unbundling operation, so do not // treat missing host triple as error if we do unbundling. - if ((Unbundle && HostTargetNum > 1) || (!Unbundle && HostTargetNum != 1)) { + if ((Unbundle && HostTargetNum > 1) || + (!Unbundle && HostTargetNum != 1 && !AllowNoHost)) { reportError(createStringError(errc::invalid_argument, "expecting exactly one host target but got " + Twine(HostTargetNum)));