Index: lib/Driver/Driver.cpp =================================================================== --- lib/Driver/Driver.cpp +++ lib/Driver/Driver.cpp @@ -2151,9 +2151,10 @@ } }; - /// CUDA action builder. It injects device code in the host backend - /// action. - class CudaActionBuilder final : public DeviceActionBuilder { + /// Base class for CUDA/HIP action builder. It injects device code in + /// the host backend action. + class CudaActionBuilderBase : public DeviceActionBuilder { + protected: /// Flags to signal if the user requested host-only or device-only /// compilation. bool CompileHostOnly = false; @@ -2170,115 +2171,11 @@ /// Flag that is set to true if this builder acted on the current input. bool IsActive = false; - public: - CudaActionBuilder(Compilation &C, DerivedArgList &Args, - const Driver::InputList &Inputs) - : DeviceActionBuilder(C, Args, Inputs, Action::OFK_Cuda) {} - - ActionBuilderReturnCode - getDeviceDependences(OffloadAction::DeviceDependences &DA, - phases::ID CurPhase, phases::ID FinalPhase, - PhasesTy &Phases) override { - if (!IsActive) - return ABRT_Inactive; - - // If we don't have more CUDA actions, we don't have any dependences to - // create for the host. - if (CudaDeviceActions.empty()) - return ABRT_Success; - - assert(CudaDeviceActions.size() == GpuArchList.size() && - "Expecting one action per GPU architecture."); - assert(!CompileHostOnly && - "Not expecting CUDA actions in host-only compilation."); - - // If we are generating code for the device or we are in a backend phase, - // we attempt to generate the fat binary. We compile each arch to ptx and - // assemble to cubin, then feed the cubin *and* the ptx into a device - // "link" action, which uses fatbinary to combine these cubins into one - // fatbin. The fatbin is then an input to the host action if not in - // device-only mode. - if (CompileDeviceOnly || CurPhase == phases::Backend) { - ActionList DeviceActions; - for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { - // Produce the device action from the current phase up to the assemble - // phase. - for (auto Ph : Phases) { - // Skip the phases that were already dealt with. - if (Ph < CurPhase) - continue; - // We have to be consistent with the host final phase. - if (Ph > FinalPhase) - break; - - CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction( - C, Args, Ph, CudaDeviceActions[I], Action::OFK_Cuda); - - if (Ph == phases::Assemble) - break; - } - - // If we didn't reach the assemble phase, we can't generate the fat - // binary. We don't need to generate the fat binary if we are not in - // device-only mode. - if (!isa(CudaDeviceActions[I]) || - CompileDeviceOnly) - continue; - - Action *AssembleAction = CudaDeviceActions[I]; - assert(AssembleAction->getType() == types::TY_Object); - assert(AssembleAction->getInputs().size() == 1); - - Action *BackendAction = AssembleAction->getInputs()[0]; - assert(BackendAction->getType() == types::TY_PP_Asm); - - for (auto &A : {AssembleAction, BackendAction}) { - OffloadAction::DeviceDependences DDep; - DDep.add(*A, *ToolChains.front(), CudaArchToString(GpuArchList[I]), - Action::OFK_Cuda); - DeviceActions.push_back( - C.MakeAction(DDep, A->getType())); - } - } - - // We generate the fat binary if we have device input actions. - if (!DeviceActions.empty()) { - CudaFatBinary = - C.MakeAction(DeviceActions, types::TY_CUDA_FATBIN); - - if (!CompileDeviceOnly) { - DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr, - Action::OFK_Cuda); - // Clear the fat binary, it is already a dependence to an host - // action. - CudaFatBinary = nullptr; - } - - // Remove the CUDA actions as they are already connected to an host - // action or fat binary. - CudaDeviceActions.clear(); - } - - // We avoid creating host action in device-only mode. - return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success; - } else if (CurPhase > phases::Backend) { - // If we are past the backend phase and still have a device action, we - // don't have to do anything as this action is already a device - // top-level action. - return ABRT_Success; - } - - assert(CurPhase < phases::Backend && "Generating single CUDA " - "instructions should only occur " - "before the backend phase!"); - - // By default, we produce an action for each device arch. - for (Action *&A : CudaDeviceActions) - A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A); - - return ABRT_Success; - } + CudaActionBuilderBase(Compilation &C, DerivedArgList &Args, + const Driver::InputList &Inputs, + Action::OffloadKind OFKind) + : DeviceActionBuilder(C, Args, Inputs, OFKind) {} ActionBuilderReturnCode addDeviceDepences(Action *HostAction) override { // While generating code for CUDA, we only depend on the host input action @@ -2317,6 +2214,17 @@ return ABRT_Success; } + // If this is an unbundling action use it as is for each CUDA toolchain. + if (auto *UA = dyn_cast(HostAction)) { + CudaDeviceActions.clear(); + for (auto Arch : GpuArchList) { + CudaDeviceActions.push_back(UA); + UA->registerDependentActionInfo(ToolChains[0], CudaArchToString(Arch), + AssociatedOffloadKind); + } + return ABRT_Success; + } + return IsActive ? ABRT_Success : ABRT_Inactive; } @@ -2325,7 +2233,7 @@ auto AddTopLevel = [&](Action *A, CudaArch BoundArch) { OffloadAction::DeviceDependences Dep; Dep.add(*A, *ToolChains.front(), CudaArchToString(BoundArch), - Action::OFK_Cuda); + AssociatedOffloadKind); AL.push_back(C.MakeAction(Dep, A->getType())); }; @@ -2354,8 +2262,17 @@ } bool initialize() override { + assert(AssociatedOffloadKind == Action::OFK_Cuda || + AssociatedOffloadKind == Action::OFK_HIP); + // We don't need to support CUDA. - if (!C.hasOffloadToolChain()) + if (AssociatedOffloadKind == Action::OFK_Cuda && + !C.hasOffloadToolChain()) + return false; + + // We don't need to support HIP. + if (AssociatedOffloadKind == Action::OFK_HIP && + !C.hasOffloadToolChain()) return false; const ToolChain *HostTC = C.getSingleOffloadToolChain(); @@ -2370,7 +2287,10 @@ return true; } - ToolChains.push_back(C.getSingleOffloadToolChain()); + ToolChains.push_back( + AssociatedOffloadKind == Action::OFK_Cuda + ? C.getSingleOffloadToolChain() + : C.getSingleOffloadToolChain()); Arg *PartialCompilationArg = Args.getLastArg( options::OPT_cuda_host_only, options::OPT_cuda_device_only, @@ -2423,6 +2343,187 @@ } }; + /// \brief CUDA action builder. It injects device code in the host backend + /// action. + class CudaActionBuilder final : public CudaActionBuilderBase { + public: + CudaActionBuilder(Compilation &C, DerivedArgList &Args, + const Driver::InputList &Inputs) + : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_Cuda) {} + + ActionBuilderReturnCode + getDeviceDependences(OffloadAction::DeviceDependences &DA, + phases::ID CurPhase, phases::ID FinalPhase, + PhasesTy &Phases) override { + if (!IsActive) + return ABRT_Inactive; + + // If we don't have more CUDA actions, we don't have any dependences to + // create for the host. + if (CudaDeviceActions.empty()) + return ABRT_Success; + + assert(CudaDeviceActions.size() == GpuArchList.size() && + "Expecting one action per GPU architecture."); + assert(!CompileHostOnly && + "Not expecting CUDA actions in host-only compilation."); + + // If we are generating code for the device or we are in a backend phase, + // we attempt to generate the fat binary. We compile each arch to ptx and + // assemble to cubin, then feed the cubin *and* the ptx into a device + // "link" action, which uses fatbinary to combine these cubins into one + // fatbin. The fatbin is then an input to the host action if not in + // device-only mode. + if (CompileDeviceOnly || CurPhase == phases::Backend) { + ActionList DeviceActions; + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + // Produce the device action from the current phase up to the assemble + // phase. + for (auto Ph : Phases) { + // Skip the phases that were already dealt with. + if (Ph < CurPhase) + continue; + // We have to be consistent with the host final phase. + if (Ph > FinalPhase) + break; + + CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction( + C, Args, Ph, CudaDeviceActions[I], Action::OFK_Cuda); + + if (Ph == phases::Assemble) + break; + } + + // If we didn't reach the assemble phase, we can't generate the fat + // binary. We don't need to generate the fat binary if we are not in + // device-only mode. + if (!isa(CudaDeviceActions[I]) || + CompileDeviceOnly) + continue; + + Action *AssembleAction = CudaDeviceActions[I]; + assert(AssembleAction->getType() == types::TY_Object); + assert(AssembleAction->getInputs().size() == 1); + + Action *BackendAction = AssembleAction->getInputs()[0]; + assert(BackendAction->getType() == types::TY_PP_Asm); + + for (auto &A : {AssembleAction, BackendAction}) { + OffloadAction::DeviceDependences DDep; + DDep.add(*A, *ToolChains.front(), CudaArchToString(GpuArchList[I]), + Action::OFK_Cuda); + DeviceActions.push_back( + C.MakeAction(DDep, A->getType())); + } + } + + // We generate the fat binary if we have device input actions. + if (!DeviceActions.empty()) { + CudaFatBinary = + C.MakeAction(DeviceActions, types::TY_CUDA_FATBIN); + + if (!CompileDeviceOnly) { + DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr, + Action::OFK_Cuda); + // Clear the fat binary, it is already a dependence to an host + // action. + CudaFatBinary = nullptr; + } + + // Remove the CUDA actions as they are already connected to an host + // action or fat binary. + CudaDeviceActions.clear(); + } + + // We avoid creating host action in device-only mode. + return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success; + } else if (CurPhase > phases::Backend) { + // If we are past the backend phase and still have a device action, we + // don't have to do anything as this action is already a device + // top-level action. + return ABRT_Success; + } + + assert(CurPhase < phases::Backend && "Generating single CUDA " + "instructions should only occur " + "before the backend phase!"); + + // By default, we produce an action for each device arch. + for (Action *&A : CudaDeviceActions) + A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A); + + return ABRT_Success; + } + }; + /// \brief HIP action builder. It injects device code in the host backend + /// action. + class HIPActionBuilder final : public CudaActionBuilderBase { + /// The linker inputs obtained for each device arch. + SmallVector DeviceLinkerInputs; + + public: + HIPActionBuilder(Compilation &C, DerivedArgList &Args, + const Driver::InputList &Inputs) + : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {} + + bool canUseBundlerUnbundler() const override { return true; } + + ActionBuilderReturnCode + getDeviceDependences(OffloadAction::DeviceDependences &DA, + phases::ID CurPhase, phases::ID FinalPhase, + PhasesTy &Phases) override { + // amdgcn does not support linking of object files, therefore we skip + // backend and assemble phases to output LLVM IR. + if (CudaDeviceActions.empty() || CurPhase == phases::Backend || + CurPhase == phases::Assemble) + return ABRT_Success; + + assert((CurPhase == phases::Link || + CudaDeviceActions.size() == GpuArchList.size()) && + "Expecting one action per GPU architecture."); + assert(!CompileHostOnly && + "Not expecting CUDA actions in host-only compilation."); + + // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch. + // This happens to each device action originated from each input file. + // Later on, device actions in DeviceLinkerInputs are used to create + // device link actions in appendLinkDependences and the created device + // link actions are passed to the offload action as device dependence. + if (CurPhase == phases::Link) { + DeviceLinkerInputs.resize(CudaDeviceActions.size()); + auto LI = DeviceLinkerInputs.begin(); + for (auto *A : CudaDeviceActions) { + LI->push_back(A); + ++LI; + } + + // We will pass the device action as a host dependence, so we don't + // need to do anything else with them. + CudaDeviceActions.clear(); + return ABRT_Success; + } + + // By default, we produce an action for each device arch. + for (Action *&A : CudaDeviceActions) + A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A, + AssociatedOffloadKind); + + return ABRT_Success; + } + + void appendLinkDependences(OffloadAction::DeviceDependences &DA) override { + // Append a new link action for each device. + unsigned I = 0; + for (auto &LI : DeviceLinkerInputs) { + auto *DeviceLinkAction = + C.MakeAction(LI, types::TY_Image); + DA.add(*DeviceLinkAction, *ToolChains[0], + CudaArchToString(GpuArchList[I]), AssociatedOffloadKind); + ++I; + } + } + }; + /// OpenMP action builder. The host bitcode is passed to the device frontend /// and all the device linked images are passed to the host link phase. class OpenMPActionBuilder final : public DeviceActionBuilder { @@ -2589,6 +2690,9 @@ // Create a specialized builder for CUDA. SpecializedBuilders.push_back(new CudaActionBuilder(C, Args, Inputs)); + // Create a specialized builder for HIP. + SpecializedBuilders.push_back(new HIPActionBuilder(C, Args, Inputs)); + // Create a specialized builder for OpenMP. SpecializedBuilders.push_back(new OpenMPActionBuilder(C, Args, Inputs)); Index: test/Driver/cuda-phases.cu =================================================================== --- test/Driver/cuda-phases.cu +++ test/Driver/cuda-phases.cu @@ -7,195 +7,267 @@ // REQUIRES: clang-driver // REQUIRES: powerpc-registered-target // REQUIRES: nvptx-registered-target - +// REQUIRES: amdgpu-registered-target // // Test single gpu architecture with complete compilation. // -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \ -// RUN: | FileCheck -check-prefix=BIN %s -// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) -// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30) -// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30) -// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30) -// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30) -// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object -// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler -// BIN-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-cuda) -// BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P10]]}, ir -// BIN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-cuda) -// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-cuda) -// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-cuda) +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=sm_30 %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=gfx803 %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s +// BIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]]) +// BIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]]) +// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]]) +// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]]) +// BIN_NV-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH:sm_30]]) +// BIN_AMD-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH:gfx803]]) +// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]]) +// BIN_NV-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]]) +// BIN_NV-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]]) +// BIN_NV-DAG: [[P8:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH]])" {[[P7]]}, object +// BIN_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH]])" {[[P6]]}, assembler +// BIN_NV-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-[[T]]) +// BIN_NV-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P10]]}, ir +// BIN_NV-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]]) +// BIN_AMD-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]]) +// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]]) +// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]]) +// BIN_AMD-DAG: [[P15:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH]]) +// BIN_AMD-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P14]]}, +// BIN_AMD-DAG-SAME: "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P15]]}, object // // Test single gpu architecture up to the assemble phase. // -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \ -// RUN: | FileCheck -check-prefix=ASM %s -// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (host-cuda) -// ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-cuda) -// ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-cuda) +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=sm_30 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=ASM,ASM_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=gfx803 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=ASM,ASM_AMD %s +// ASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]]) +// ASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// ASM_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]]) +// ASM_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler +// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-[[T]]) +// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (host-[[T]]) +// ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-[[T]]) +// ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-[[T]]) // // Test two gpu architectures with complete compilation. // -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ -// RUN: | FileCheck -check-prefix=BIN2 %s -// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) -// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30) -// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30) -// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30) -// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30) -// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object -// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler -// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, sm_35) -// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, sm_35) -// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, sm_35) -// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, sm_35) -// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P14]]}, object -// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P13]]}, assembler -// BIN2-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-cuda) -// BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P17]]}, ir -// BIN2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-cuda) -// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-cuda) -// BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-cuda) +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN2,BIN2_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN2,BIN2_AMD %s +// BIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]]) +// BIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]]) +// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]]) +// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]]) +// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH1:sm_30|gfx803]]) +// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH1]]) +// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH1]]) +// BIN2_NV-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH1]]) +// BIN2_NV-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH1]]) +// BIN2_NV-DAG: [[P8:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH1]])" {[[P7]]}, object +// BIN2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH1]])" {[[P6]]}, assembler +// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]]) +// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]]) +// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-[[T]], [[ARCH2]]) +// BIN2_NV-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-[[T]], [[ARCH2]]) +// BIN2_NV-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-[[T]], [[ARCH2]]) +// BIN2_NV-DAG: [[P15:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P14]]}, object +// BIN2_NV-DAG: [[P16:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P13]]}, assembler +// BIN2_NV-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-[[T]]) +// BIN2_NV-DAG: [[P18:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P17]]}, ir +// BIN2_NV-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-[[T]]) +// BIN2_AMD-DAG: [[P19:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]]) +// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-[[T]]) +// BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-[[T]]) +// BIN2_AMD-DAG: [[P22:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH1]]) +// BIN2_AMD-DAG: [[P23:[0-9]+]]: linker, {[[P12]]}, image, (device-[[T]], [[ARCH2]]) +// BIN2_AMD-DAG: [[P24:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P21]]}, +// BIN2_AMD-DAG-SAME: "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P22]]}, +// BIN2_AMD-DAG-SAME: "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH2]])" {[[P23]]}, object // // Test two gpu architecturess up to the assemble phase. // -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \ -// RUN: | FileCheck -check-prefix=ASM2 %s -// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35) -// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35) -// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35) -// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler -// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (host-cuda) -// ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-cuda) -// ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-cuda) +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=ASM2,ASM2_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=ASM2,ASM2_AMD %s +// ASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH1:sm_30]]) +// ASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH1:gfx803]]) +// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH1]]) +// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH1]]) +// ASM2_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH1]]) +// ASM2_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P3]]}, assembler +// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]]) +// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]]) +// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]]) +// ASM2_NV-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]]) +// ASM2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler +// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-[[T]]) +// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (host-[[T]]) +// ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-[[T]]) +// ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-[[T]]) // // Test single gpu architecture with complete compilation in host-only // compilation mode. // -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \ -// RUN: | FileCheck -check-prefix=HBIN %s -// HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) -// HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) -// HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda) -// HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda) +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefixes=HBIN,HBIN_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=gfx803 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefixes=HBIN,HBIN_AMD %s +// HBIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]]) +// HBIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]]) +// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]]) +// HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]]) +// HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]]) +// HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-[[T]]) +// HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-[[T]]) // // Test single gpu architecture up to the assemble phase in host-only // compilation mode. // -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \ -// RUN: | FileCheck -check-prefix=HASM %s -// HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) -// HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=HASM,HASM_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=gfx803 %s --cuda-host-only -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=HASM,HASM_AMD %s +// HASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]]) +// HASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]]) +// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]]) +// HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]]) +// HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]]) // // Test two gpu architectures with complete compilation in host-only // compilation mode. // -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \ -// RUN: | FileCheck -check-prefix=HBIN2 %s -// HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) -// HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) -// HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda) -// HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda) +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefixes=HBIN2,HBIN2_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefixes=HBIN2,HBIN2_AMD %s +// HBIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]]) +// HBIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]]) +// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]]) +// HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]]) +// HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]]) +// HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-[[T]]) +// HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-[[T]]) // // Test two gpu architectures up to the assemble phase in host-only // compilation mode. // -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \ -// RUN: | FileCheck -check-prefix=HASM2 %s -// HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) -// HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S \ +// RUN: 2>&1 | FileCheck -check-prefixes=HASM2,HASM2_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only -S \ +// RUN: 2>&1 | FileCheck -check-prefixes=HASM2,HASM2_AMD %s +// HASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]]) +// HASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]]) +// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]]) +// HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]]) +// HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]]) // // Test single gpu architecture with complete compilation in device-only // compilation mode. // -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \ -// RUN: | FileCheck -check-prefix=DBIN %s -// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30) -// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \ +// RUN: | FileCheck -check-prefixes=DBIN,DBIN_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only 2>&1 \ +// RUN: | FileCheck -check-prefixes=DBIN,DBIN_AMD %s +// DBIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]]) +// DBIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// DBIN_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]]) +// DBIN_NV-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]]) +// DBIN_NV-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] (nvptx64-nvidia-cuda:[[ARCH]])" {[[P4]]}, object // // Test single gpu architecture up to the assemble phase in device-only // compilation mode. // -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \ -// RUN: | FileCheck -check-prefix=DASM %s -// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=DASM,DASM_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=DASM,DASM_AMD %s +// DASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]]) +// DASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// DASM_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]]) +// DASM_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler // // Test two gpu architectures with complete compilation in device-only // compilation mode. // -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \ -// RUN: | FileCheck -check-prefix=DBIN2 %s -// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30) -// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object -// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, sm_35) -// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, sm_35) -// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, sm_35) -// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, sm_35) -// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P10]]}, object +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \ +// RUN: | FileCheck -check-prefixes=DBIN2,DBIN2_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only \ +// RUN: 2>&1 | FileCheck -check-prefixes=DBIN2,DBIN2_AMD %s +// DBIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]]) +// DBIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// DBIN2_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]]) +// DBIN2_NV-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]]) +// DBIN2_NV-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH]])" {[[P4]]}, object +// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]]) +// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]]) +// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-[[T]], [[ARCH2]]) +// DBIN2_NV-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-[[T]], [[ARCH2]]) +// DBIN2_NV-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-[[T]], [[ARCH2]]) +// DBIN2_NV-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P10]]}, object // // Test two gpu architectures up to the assemble phase in device-only // compilation mode. // -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \ -// RUN: | FileCheck -check-prefix=DASM2 %s -// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35) -// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35) -// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35) -// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \ +// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S \ +// RUN: 2>&1 | FileCheck -check-prefixes=DASM2,DASM2_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ +// RUN: --cuda-device-only -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=DASM2,DASM2_AMD %s +// DASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]]) +// DASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// DASM2_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]]) +// DASM2_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler +// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]]) +// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]]) +// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]]) +// DASM2_NV-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]]) +// DASM2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler