Index: include/clang/Basic/DiagnosticDriverKinds.td =================================================================== --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -38,7 +38,7 @@ "but installation at %3 is %4. Use --cuda-path to specify a different CUDA " "install, pass a different GPU arch with --cuda-gpu-arch, or pass " "--no-cuda-version-check.">; -def err_drv_cuda_nvptx_host : Error<"unsupported use of NVPTX for host compilation.">; +def err_drv_cuda_host_arch : Error<"unsupported architecture '%0' for host compilation.">; def err_drv_invalid_thread_model_for_target : Error< "invalid thread model '%0' in '%1' for this target">; def err_drv_invalid_linker_name : Error< Index: include/clang/Driver/Action.h =================================================================== --- include/clang/Driver/Action.h +++ include/clang/Driver/Action.h @@ -88,6 +88,7 @@ // The device offloading tool chains - one bit for each programming model. OFK_Cuda = 0x02, OFK_OpenMP = 0x04, + OFK_HIP = 0x08, }; static const char *getClassName(ActionClass AC); Index: include/clang/Driver/Options.td =================================================================== --- include/clang/Driver/Options.td +++ include/clang/Driver/Options.td @@ -547,7 +547,7 @@ HelpText<"Compile CUDA code for both host and device (default). Has no " "effect on non-CUDA compilations.">; def cuda_gpu_arch_EQ : Joined<["--"], "cuda-gpu-arch=">, Flags<[DriverOption]>, - HelpText<"CUDA GPU architecture (e.g. sm_35). May be specified more than once.">; + HelpText<"CUDA/HIP GPU architecture (e.g. sm_35). May be specified more than once.">; def no_cuda_gpu_arch_EQ : Joined<["--"], "no-cuda-gpu-arch=">, Flags<[DriverOption]>, HelpText<"Remove GPU architecture (e.g. sm_35) from the list of GPUs to compile for. " "'all' resets the list to its default value.">; @@ -572,6 +572,8 @@ def fcuda_rdc : Flag<["-"], "fcuda-rdc">, Flags<[CC1Option, HelpHidden]>, HelpText<"Generate relocatable device code, also known as separate compilation mode.">; def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">; +def hip_device_lib_path_EQ : Joined<["--"], "hip-device-lib-path=">, Group, + HelpText<"HIP device library path">; def dA : Flag<["-"], "dA">, Group; def dD : Flag<["-"], "dD">, Group, Flags<[CC1Option]>, HelpText<"Print macro definitions in -E mode in addition to normal output">; Index: include/clang/Driver/ToolChain.h =================================================================== --- include/clang/Driver/ToolChain.h +++ include/clang/Driver/ToolChain.h @@ -121,11 +121,13 @@ path_list ProgramPaths; mutable std::unique_ptr Clang; + mutable std::unique_ptr DeviceLibraryLink; mutable std::unique_ptr Assemble; mutable std::unique_ptr Link; mutable std::unique_ptr OffloadBundler; Tool *getClang() const; + Tool *getDeviceLibraryLink() const; Tool *getAssemble() const; Tool *getLink() const; Tool *getClangAs() const; @@ -151,6 +153,7 @@ void setTripleEnvironment(llvm::Triple::EnvironmentType Env); virtual Tool *buildAssembler() const; + virtual Tool *buildDeviceLibraryLinker() const; virtual Tool *buildLinker() const; virtual Tool *getTool(Action::ActionClass AC) const; Index: lib/Driver/Action.cpp =================================================================== --- lib/Driver/Action.cpp +++ lib/Driver/Action.cpp @@ -96,6 +96,8 @@ return "device-cuda"; case OFK_OpenMP: return "device-openmp"; + case OFK_HIP: + return "device-hip"; // TODO: Add other programming models here. 
} @@ -104,8 +106,13 @@ return {}; std::string Res("host"); + assert(!((ActiveOffloadKindMask & OFK_Cuda) && + (ActiveOffloadKindMask & OFK_HIP)) && + "Cannot offload CUDA and HIP at the same time"); if (ActiveOffloadKindMask & OFK_Cuda) Res += "-cuda"; + if (ActiveOffloadKindMask & OFK_HIP) + Res += "-hip"; if (ActiveOffloadKindMask & OFK_OpenMP) Res += "-openmp"; @@ -142,6 +149,8 @@ return "cuda"; case OFK_OpenMP: return "openmp"; + case OFK_HIP: + return "hip"; // TODO: Add other programming models here. } Index: lib/Driver/Compilation.cpp =================================================================== --- lib/Driver/Compilation.cpp +++ lib/Driver/Compilation.cpp @@ -196,10 +196,10 @@ if (FailingCommands.empty()) return false; - // CUDA can have the same input source code compiled multiple times so do not - // compiled again if there are already failures. It is OK to abort the CUDA - // pipeline on errors. - if (A->isOffloading(Action::OFK_Cuda)) + // CUDA/HIP can have the same input source code compiled multiple times so do + // not compile it again if there are already failures. It is OK to abort the + // CUDA/HIP pipeline on errors. + if (A->isOffloading(Action::OFK_Cuda) || A->isOffloading(Action::OFK_HIP)) return true; for (const auto &CI : FailingCommands) Index: lib/Driver/Driver.cpp =================================================================== --- lib/Driver/Driver.cpp +++ lib/Driver/Driver.cpp @@ -535,24 +535,41 @@ InputList &Inputs) { // // CUDA/HIP // // We need to generate a CUDA toolchain if any of the inputs has a CUDA type. - if (llvm::any_of(Inputs, [](std::pair<types::ID, const llvm::opt::Arg *> &I) { - return types::isCuda(I.first); - })) { + // ToDo: Handle mixed CUDA/HIP input files and -x hip option. Diagnose + // CUDA on amdgcn and HIP on nvptx. + bool IsHIP = + C.getInputArgs().hasArg(options::OPT_x) && + StringRef(C.getInputArgs().getLastArg(options::OPT_x)->getValue()) == + "hip"; + if (llvm::any_of(Inputs, + [](std::pair<types::ID, const llvm::opt::Arg *> &I) { + return types::isCuda(I.first); + }) || + IsHIP) { const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>(); const llvm::Triple &HostTriple = HostTC->getTriple(); - llvm::Triple CudaTriple(HostTriple.isArch64Bit() ? "nvptx64-nvidia-cuda" - : "nvptx-nvidia-cuda"); - // Use the CUDA and host triples as the key into the ToolChains map, because - // the device toolchain we create depends on both. + StringRef DeviceTripleStr; + auto OFK = IsHIP ? Action::OFK_HIP : Action::OFK_Cuda; + if (IsHIP) { + // HIP is only supported on amdgcn. + DeviceTripleStr = "amdgcn-amd-amdhsa"; + } else { + // CUDA is only supported on nvptx. + DeviceTripleStr = HostTriple.isArch64Bit() ? "nvptx64-nvidia-cuda" + : "nvptx-nvidia-cuda"; + } + llvm::Triple CudaTriple(DeviceTripleStr); + // Use the CUDA/HIP and host triples as the key into the ToolChains map, + // because the device toolchain we create depends on both. auto &CudaTC = ToolChains[CudaTriple.str() + "/" + HostTriple.str()]; if (!CudaTC) { CudaTC = llvm::make_unique<toolchains::CudaToolChain>( - *this, CudaTriple, *HostTC, C.getInputArgs(), Action::OFK_Cuda); + *this, CudaTriple, *HostTC, C.getInputArgs(), OFK); } - C.addOffloadDeviceToolChain(CudaTC.get(), Action::OFK_Cuda); + C.addOffloadDeviceToolChain(CudaTC.get(), OFK); } // @@ -2120,9 +2137,9 @@ } }; - /// \brief CUDA action builder. It injects device code in the host backend - /// action. - class CudaActionBuilder final : public DeviceActionBuilder { + /// \brief Base class for the CUDA/HIP action builders. It injects device code + /// in the host backend action.
+ class CudaActionBuilderBase : public DeviceActionBuilder { /// Flags to signal if the user requested host-only or device-only /// compilation. bool CompileHostOnly = false; @@ -2140,10 +2157,14 @@ /// Flag that is set to true if this builder acted on the current input. bool IsActive = false; + /// The offload kind for CUDA/HIP. + Action::OffloadKind OFK; + public: - CudaActionBuilder(Compilation &C, DerivedArgList &Args, - const Driver::InputList &Inputs) - : DeviceActionBuilder(C, Args, Inputs, Action::OFK_Cuda) {} + CudaActionBuilderBase(Compilation &C, DerivedArgList &Args, + const Driver::InputList &Inputs, + Action::OffloadKind OFKind) + : DeviceActionBuilder(C, Args, Inputs, OFKind), OFK(OFKind) {} ActionBuilderReturnCode getDeviceDependences(OffloadAction::DeviceDependences &DA, @@ -2182,7 +2203,7 @@ break; CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction( - C, Args, Ph, CudaDeviceActions[I], Action::OFK_Cuda); + C, Args, Ph, CudaDeviceActions[I], OFK); if (Ph == phases::Assemble) break; @@ -2205,7 +2226,7 @@ for (auto &A : {AssembleAction, BackendAction}) { OffloadAction::DeviceDependences DDep; DDep.add(*A, *ToolChains.front(), CudaArchToString(GpuArchList[I]), - Action::OFK_Cuda); + OFK); DeviceActions.push_back( C.MakeAction(DDep, A->getType())); } @@ -2218,7 +2239,7 @@ if (!CompileDeviceOnly) { DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr, - Action::OFK_Cuda); + OFK); // Clear the fat binary, it is already a dependence to an host // action. CudaFatBinary = nullptr; @@ -2293,8 +2314,7 @@ // Utility to append actions to the top level list. auto AddTopLevel = [&](Action *A, CudaArch BoundArch) { OffloadAction::DeviceDependences Dep; - Dep.add(*A, *ToolChains.front(), CudaArchToString(BoundArch), - Action::OFK_Cuda); + Dep.add(*A, *ToolChains.front(), CudaArchToString(BoundArch), OFK); AL.push_back(C.MakeAction(Dep, A->getType())); }; @@ -2323,21 +2343,32 @@ } bool initialize() override { + assert(OFK == Action::OFK_Cuda || OFK == Action::OFK_HIP); + // We don't need to support CUDA. - if (!C.hasOffloadToolChain()) + if (OFK == Action::OFK_Cuda && !C.hasOffloadToolChain()) + return false; + + // We don't need to support HIP. + if (OFK == Action::OFK_HIP && !C.hasOffloadToolChain()) return false; const ToolChain *HostTC = C.getSingleOffloadToolChain(); assert(HostTC && "No toolchain for host compilation."); - if (HostTC->getTriple().isNVPTX()) { - // We do not support targeting NVPTX for host compilation. Throw + if (HostTC->getTriple().isNVPTX() || + HostTC->getTriple().getArch() == llvm::Triple::amdgcn) { + // We do not support targeting NVPTX/AMDGCN for host compilation. Throw // an error and abort pipeline construction early so we don't trip // asserts that assume device-side compilation. - C.getDriver().Diag(diag::err_drv_cuda_nvptx_host); + C.getDriver().Diag(diag::err_drv_cuda_host_arch) + << HostTC->getTriple().getArchName(); return true; } - ToolChains.push_back(C.getSingleOffloadToolChain()); + ToolChains.push_back( + OFK == Action::OFK_Cuda + ? C.getSingleOffloadToolChain() + : C.getSingleOffloadToolChain()); Arg *PartialCompilationArg = Args.getLastArg( options::OPT_cuda_host_only, options::OPT_cuda_device_only, @@ -2390,6 +2421,24 @@ } }; + /// \brief CUDA action builder. It injects device code in the host backend + /// action. 
+ class CudaActionBuilder final : public CudaActionBuilderBase { + public: + CudaActionBuilder(Compilation &C, DerivedArgList &Args, + const Driver::InputList &Inputs) + : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_Cuda) {} + }; + + /// \brief HIP action builder. It injects device code in the host backend + /// action. + class HIPActionBuilder final : public CudaActionBuilderBase { + public: + HIPActionBuilder(Compilation &C, DerivedArgList &Args, + const Driver::InputList &Inputs) + : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {} + }; + /// OpenMP action builder. The host bitcode is passed to the device frontend /// and all the device linked images are passed to the host link phase. class OpenMPActionBuilder final : public DeviceActionBuilder { @@ -2556,6 +2605,9 @@ // Create a specialized builder for CUDA. SpecializedBuilders.push_back(new CudaActionBuilder(C, Args, Inputs)); + // Create a specialized builder for HIP. + SpecializedBuilders.push_back(new HIPActionBuilder(C, Args, Inputs)); + // Create a specialized builder for OpenMP. SpecializedBuilders.push_back(new OpenMPActionBuilder(C, Args, Inputs)); @@ -3227,6 +3279,9 @@ bool SaveTemps; bool EmbedBitcode; + /// True if there are LLVM IR input files (either bitcode or LLVM assembly). + bool HasIRInputs; + /// Get previous dependent action or null if that does not exist. If /// \a CanBeCollapsed is false, that action must be legal to collapse or /// null will be returned. @@ -3284,7 +3339,7 @@ bool canCollapsePreprocessorAction() const { return !C.getArgs().hasArg(options::OPT_no_integrated_cpp) && !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps && - !C.getArgs().hasArg(options::OPT_rewrite_objc); + !HasIRInputs && !C.getArgs().hasArg(options::OPT_rewrite_objc); } /// Struct that relates an action with the offload actions that would be @@ -3329,6 +3384,9 @@ if (!AJ || !BJ || !CJ) return nullptr; + if (AJ->isOffloading(Action::OFK_HIP)) + return nullptr; + // Get compiler tool. const Tool *T = TC.SelectTool(*CJ); if (!T) return nullptr; @@ -3360,6 +3418,9 @@ if (!AJ || !BJ) return nullptr; + if (AJ->isOffloading(Action::OFK_HIP)) + return nullptr; + // Retrieve the compile job, backend action must always be preceded by one. ActionList CompileJobOffloadActions; auto *CJ = getPrevDependentAction(BJ->getInputs(), CompileJobOffloadActions, @@ -3393,6 +3454,16 @@ if (!BJ || !CJ) return nullptr; + // The compile and backend phases cannot normally be combined for HIP, since + // the DeviceLibraryLink step must run between them to produce the linked and + // optimized IR consumed by the assembler (e.g. with plain "-c -S"). The one + // exception is compile-only IR generation with "-c -S -emit-llvm". + if (BJ->isOffloading(Action::OFK_HIP) && + !(C.getArgs().hasArg(options::OPT_c) && + C.getArgs().hasArg(options::OPT_S) && + C.getArgs().hasArg(options::OPT_emit_llvm))) + return nullptr; + // Get compiler tool. const Tool *T = TC.SelectTool(*CJ); if (!T) @@ -3436,6 +3507,18 @@ EmbedBitcode(EmbedBitcode) { assert(BaseAction && "Invalid base action."); IsHostSelector = BaseAction->getOffloadingDeviceKind() == Action::OFK_None; + // Check whether there are LLVM IR input files.
+ HasIRInputs = false; + for (Arg *A : C.getInputArgs()) { + if (A->getOption().getKind() == Option::InputClass) { + auto Ext = llvm::sys::path::extension(A->getValue()); + if (!Ext.empty()) { + auto InputType = TC.LookupTypeForExtension(Ext.drop_front()); + HasIRInputs |= + InputType == types::TY_LLVM_IR || InputType == types::TY_LLVM_BC; + } + } + } } /// Check if a chain of actions can be combined and return the tool that can Index: lib/Driver/ToolChain.cpp =================================================================== --- lib/Driver/ToolChain.cpp +++ lib/Driver/ToolChain.cpp @@ -253,6 +253,10 @@ return Clang.get(); } +Tool *ToolChain::buildDeviceLibraryLinker() const { + return new tools::Clang(*this); +} + Tool *ToolChain::buildAssembler() const { return new tools::ClangAs(*this); } @@ -267,6 +271,12 @@ return Assemble.get(); } +Tool *ToolChain::getDeviceLibraryLink() const { + if (!DeviceLibraryLink) + DeviceLibraryLink.reset(buildDeviceLibraryLinker()); + return DeviceLibraryLink.get(); +} + Tool *ToolChain::getClangAs() const { if (!Assemble) Assemble.reset(new tools::ClangAs(*this)); @@ -307,8 +317,9 @@ case Action::AnalyzeJobClass: case Action::MigrateJobClass: case Action::VerifyPCHJobClass: - case Action::BackendJobClass: return getClang(); + case Action::BackendJobClass: + return getDeviceLibraryLink(); case Action::OffloadBundlingJobClass: case Action::OffloadUnbundlingJobClass: @@ -406,8 +417,17 @@ } Tool *ToolChain::SelectTool(const JobAction &JA) const { - if (getDriver().ShouldUseClangCompiler(JA)) return getClang(); Action::ActionClass AC = JA.getKind(); + if (JA.isOffloading(Action::OFK_HIP) && + AC == Action::BackendJobClass) { + if (Args.hasArg(options::OPT_emit_llvm) || + Args.hasArg(options::OPT_emit_llvm_bc)) + return getClang(); + else + return getTool(AC); + } + if (getDriver().ShouldUseClangCompiler(JA)) + return getClang(); if (AC == Action::AssembleJobClass && useIntegratedAs()) return getClangAs(); return getTool(AC); Index: lib/Driver/ToolChains/Clang.cpp =================================================================== --- lib/Driver/ToolChains/Clang.cpp +++ lib/Driver/ToolChains/Clang.cpp @@ -132,6 +132,11 @@ else if (JA.isDeviceOffloading(Action::OFK_Cuda)) Work(*C.getSingleOffloadToolChain<Action::OFK_Cuda>()); + if (JA.isHostOffloading(Action::OFK_HIP)) + Work(*C.getSingleOffloadToolChain<Action::OFK_Host>()); + else if (JA.isDeviceOffloading(Action::OFK_HIP)) + Work(*C.getSingleOffloadToolChain<Action::OFK_HIP>()); + if (JA.isHostOffloading(Action::OFK_OpenMP)) { auto TCs = C.getOffloadToolChains<Action::OFK_OpenMP>(); for (auto II = TCs.first, IE = TCs.second; II != IE; ++II) @@ -3091,13 +3096,14 @@ // Check number of inputs for sanity. We need at least one input. assert(Inputs.size() >= 1 && "Must have at least one input."); const InputInfo &Input = Inputs[0]; - // CUDA compilation may have multiple inputs (source file + results of + // CUDA/HIP compilation may have multiple inputs (source file + results of // device-side compilations). OpenMP device jobs also take the host IR as a // second input. All other jobs are expected to have exactly one // input.
bool IsCuda = JA.isOffloading(Action::OFK_Cuda); + bool IsHIP = JA.isOffloading(Action::OFK_HIP); bool IsOpenMPDevice = JA.isDeviceOffloading(Action::OFK_OpenMP); - assert((IsCuda || (IsOpenMPDevice && Inputs.size() == 2) || + assert((IsCuda || IsHIP || (IsOpenMPDevice && Inputs.size() == 2) || Inputs.size() == 1) && "Unable to handle multiple inputs."); @@ -3109,10 +3115,10 @@ bool IsWindowsMSVC = RawTriple.isWindowsMSVCEnvironment(); bool IsIAMCU = RawTriple.isOSIAMCU(); - // Adjust IsWindowsXYZ for CUDA compilations. Even when compiling in device - // mode (i.e., getToolchain().getTriple() is NVPTX, not Windows), we need to - // pass Windows-specific flags to cc1. - if (IsCuda) { + // Adjust IsWindowsXYZ for CUDA/HIP compilations. Even when compiling in + // device mode (i.e., getToolchain().getTriple() is NVPTX/AMDGCN, not + // Windows), we need to pass Windows-specific flags to cc1. + if (IsCuda || IsHIP) { IsWindowsMSVC |= AuxTriple && AuxTriple->isWindowsMSVCEnvironment(); IsWindowsGNU |= AuxTriple && AuxTriple->isWindowsGNUEnvironment(); IsWindowsCygnus |= AuxTriple && AuxTriple->isWindowsCygwinEnvironment(); @@ -3136,18 +3142,21 @@ Args.ClaimAllArgs(options::OPT_MJ); } - if (IsCuda) { - // We have to pass the triple of the host if compiling for a CUDA device and - // vice-versa. + if (IsCuda || IsHIP) { + // We have to pass the triple of the host if compiling for a CUDA/HIP device + // and vice-versa. std::string NormalizedTriple; - if (JA.isDeviceOffloading(Action::OFK_Cuda)) + if (JA.isDeviceOffloading(Action::OFK_Cuda) || + JA.isDeviceOffloading(Action::OFK_HIP)) NormalizedTriple = C.getSingleOffloadToolChain() ->getTriple() .normalize(); else - NormalizedTriple = C.getSingleOffloadToolChain() - ->getTriple() - .normalize(); + NormalizedTriple = + (IsCuda ? C.getSingleOffloadToolChain() + : C.getSingleOffloadToolChain()) + ->getTriple() + .normalize(); CmdArgs.push_back("-aux-triple"); CmdArgs.push_back(Args.MakeArgString(NormalizedTriple)); @@ -4686,9 +4695,10 @@ CmdArgs.push_back(Args.MakeArgString(Flags)); } - if (IsCuda) { - // Host-side cuda compilation receives all device-side outputs in a single - // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary. + if (IsCuda || IsHIP) { + // Host-side cuda/HIP compilation receives all device-side outputs in a + // single fatbin as Inputs[1]. Include the binary with + // -fcuda-include-gpubinary. if (Inputs.size() > 1) { assert(Inputs.size() == 2 && "More than one GPU binary!"); CmdArgs.push_back("-fcuda-include-gpubinary"); Index: lib/Driver/ToolChains/Cuda.h =================================================================== --- lib/Driver/ToolChains/Cuda.h +++ lib/Driver/ToolChains/Cuda.h @@ -127,6 +127,53 @@ }; } // end namespace NVPTX + +namespace AMDGCN { +// Run llc, the AMDGPU assembler. +class LLVM_LIBRARY_VISIBILITY Assembler : public Tool { +public: + Assembler(const ToolChain &TC) + : Tool("AMDGCN::Assembler", "llc", TC, RF_Full, llvm::sys::WEM_UTF8, + "--options-file") {} + + bool hasIntegratedCPP() const override { return false; } + + void ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const override; +}; + +// Runs clang-offload-bundler, which combines AMDGCN object files into a single +// output file. 
+class LLVM_LIBRARY_VISIBILITY Linker : public Tool { +public: + Linker(const ToolChain &TC) + : Tool("AMDGCN::Linker", "clang-offload-bundler", TC, RF_Full, + llvm::sys::WEM_UTF8, "--options-file") {} + + bool hasIntegratedCPP() const override { return false; } + + void ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const override; +}; + +// For amdgcn the device library linker is llvm-link + opt. +class LLVM_LIBRARY_VISIBILITY DeviceLibraryLinker : public Tool { +public: + DeviceLibraryLinker(const ToolChain &TC) + : Tool("AMDGCN::DeviceLibraryLinker", "device-library-linker", TC, + RF_Full, llvm::sys::WEM_UTF8, "--options-file") {} + virtual bool hasIntegratedCPP() const override { return false; } + virtual void ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const override; +}; +} // end namespace AMDGCN } // end namespace tools namespace toolchains { @@ -184,6 +231,7 @@ CudaInstallationDetector CudaInstallation; protected: + Tool *buildDeviceLibraryLinker() const override; // for amdgcn, link and opt Tool *buildAssembler() const override; // ptxas Tool *buildLinker() const override; // fatbinary (ok, not really a linker) Index: lib/Driver/ToolChains/Cuda.cpp =================================================================== --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -311,6 +311,201 @@ << CudaVersionToString(Version) << "\n"; } +static bool addBCLib(Compilation &C, const ArgList &Args, + ArgStringList &CmdArgs, ArgStringList LibraryPaths, + StringRef BCName) { + std::string FullName; + bool FoundLibDevice = false; + for (std::string LibraryPath : LibraryPaths) { + SmallString<128> Path(LibraryPath); + llvm::sys::path::append(Path, BCName); + FullName = Args.MakeArgString(Path); + if (llvm::sys::fs::exists(FullName.c_str())) { + FoundLibDevice = true; + break; + } + } + if (!FoundLibDevice) + C.getDriver().Diag(diag::err_drv_no_such_file) << BCName; + CmdArgs.push_back(Args.MakeArgString(FullName)); + return FoundLibDevice; +} + +void AMDGCN::Assembler::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + const auto &TC = + static_cast(getToolChain()); + assert(TC.getTriple().getArch() == llvm::Triple::amdgcn && "Wrong platform"); + + ArgStringList CmdArgs; + for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); + it != ie; ++it) { + const InputInfo &II = *it; + CmdArgs.push_back(II.getFilename()); + } + CmdArgs.push_back("-mtriple=amdgcn-amd-amdhsa"); + CmdArgs.push_back("-filetype=obj"); + std::string GFXNAME = JA.getOffloadingArch(); + CmdArgs.push_back(Args.MakeArgString("-mcpu=" + GFXNAME)); + CmdArgs.push_back("-o"); + std::string TmpName = C.getDriver().GetTemporaryPath("LLC_OUTPUT", "o"); + const char *llcOutputFile = + C.addTempFile(C.getArgs().MakeArgString(TmpName.c_str())); + CmdArgs.push_back(llcOutputFile); + SmallString<128> llcPath(C.getDriver().Dir); + llvm::sys::path::append(llcPath, "llc"); + const char *Exec = Args.MakeArgString(llcPath); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + + ArgStringList CmdArgs2; + // The output from ld.lld is an HSA code object file. 
+ CmdArgs2.append({"-flavor", "gnu", "--no-undefined", "-shared", "-o"}); + CmdArgs2.push_back(Output.getFilename()); + CmdArgs2.push_back(llcOutputFile); + SmallString<128> lldPath(C.getDriver().Dir); + llvm::sys::path::append(lldPath, "lld"); + const char *lld = Args.MakeArgString(lldPath); + C.addCommand(llvm::make_unique(JA, *this, lld, CmdArgs2, Inputs)); + return; +} + +void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + const auto &TC = + static_cast(getToolChain()); + assert(TC.getTriple().getArch() == llvm::Triple::amdgcn && "Wrong platform"); + + ArgStringList CmdArgs; + CmdArgs.push_back(Args.MakeArgString("-type=o")); + + // ToDo: Remove the dummy host binary entry which is required by + // clang-offload-bundler. + std::string targets = "-targets=host-x86_64-uknown-linux"; + std::string inputs = "-inputs=/dev/null"; + for (const auto &II : Inputs) { + if (II.getType() != types::TY_PP_Asm) { + // ToDo: Teach clang-offload-bundler to recognize hip. + targets = targets + ",openmp-amdgcn--amdhsa-" + + StringRef(II.getAction()->getOffloadingArch()).str(); + inputs = inputs + "," + II.getFilename(); + } + } + CmdArgs.push_back(Args.MakeArgString(targets)); + CmdArgs.push_back(Args.MakeArgString(inputs)); + + auto outputArgString = + Args.MakeArgString(std::string("-outputs=").append(Output.getFilename())); + CmdArgs.push_back(outputArgString); + + SmallString<128> ExecPath(C.getDriver().Dir); + llvm::sys::path::append(ExecPath, "clang-offload-bundler"); + const char *Exec = Args.MakeArgString(ExecPath); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + return; +} + +void AMDGCN::DeviceLibraryLinker::ConstructJob( + Compilation &C, const JobAction &JA, const InputInfo &Output, + const InputInfoList &Inputs, const ArgList &Args, + const char *LinkingOutput) const { + + assert(StringRef(JA.getOffloadingArch()).startswith("gfx") && + " unless gfx processor, backend should be clang"); + + // For amdgcn the DeviceLibraryLinker Job will call llvm-link & opt steps. + ArgStringList CmdArgs; + // Add the input bc's created by compile step. + for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); + it != ie; ++it) { + const InputInfo &II = *it; + CmdArgs.push_back(II.getFilename()); + } + + std::string GFXNAME = JA.getOffloadingArch(); + + ArgStringList LibraryPaths; + + // Find in --hip-device-lib-path and HIP_LIBRARY_PATH. + for (auto Arg : Args) { + if (Arg->getSpelling() == "--hip-device-lib-path=") { + LibraryPaths.push_back(Args.MakeArgString(Arg->getValue())); + } + } + + addDirectoryList(Args, LibraryPaths, "-L", "HIP_DEVICE_LIB_PATH"); + + addBCLib(C, Args, CmdArgs, LibraryPaths, "libhiprt.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "opencl.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "ockl.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "irif.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "ocml.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_finite_only_off.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_daz_opt_off.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, + "oclc_correctly_rounded_sqrt_on.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_unsafe_math_off.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "hc.amdgcn.bc"); + // Drop gfx in GFXNAME. 
+ addBCLib(C, Args, CmdArgs, LibraryPaths, + (Twine("oclc_isa_version_") + StringRef(GFXNAME).drop_front(3) + + ".amdgcn.bc") + .str()); + + // Add an intermediate output file which is input to opt + CmdArgs.push_back("-o"); + std::string TmpName = C.getDriver().GetTemporaryPath("OPT_INPUT", "bc"); + const char *ResultingBitcodeF = + C.addTempFile(C.getArgs().MakeArgString(TmpName.c_str())); + CmdArgs.push_back(ResultingBitcodeF); + SmallString<128> ExecPath(C.getDriver().Dir); + llvm::sys::path::append(ExecPath, "llvm-link"); + const char *Exec = Args.MakeArgString(ExecPath); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + + ArgStringList OptArgs; + // The input to opt is the output from llvm-link. + OptArgs.push_back(ResultingBitcodeF); + // Pass optimization arg to opt. + if (Arg *A = Args.getLastArg(options::OPT_O_Group)) { + StringRef OOpt = "3"; + if (A->getOption().matches(options::OPT_O4) || + A->getOption().matches(options::OPT_Ofast)) + OOpt = "3"; + else if (A->getOption().matches(options::OPT_O0)) + OOpt = "0"; + else if (A->getOption().matches(options::OPT_O)) { + // -Os, -Oz, and -O(anything else) map to -O2 + OOpt = llvm::StringSwitch(A->getValue()) + .Case("1", "1") + .Case("2", "2") + .Case("3", "3") + .Case("s", "2") + .Case("z", "2") + .Default("2"); + } + OptArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt)); + + OptArgs.push_back("-S"); + const char *mcpustr = Args.MakeArgString("-mcpu=" + GFXNAME); + OptArgs.push_back(mcpustr); + OptArgs.push_back("-dce"); + OptArgs.push_back("-sroa"); + OptArgs.push_back("-globaldce"); + } + OptArgs.push_back("-o"); + OptArgs.push_back(Output.getFilename()); + SmallString<128> OptPath(C.getDriver().Dir); + llvm::sys::path::append(OptPath, "opt"); + const char *OptExec = Args.MakeArgString(OptPath); + C.addCommand(llvm::make_unique(JA, *this, OptExec, OptArgs, Inputs)); +} + void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, @@ -400,7 +595,8 @@ Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target, options::OPT_fnoopenmp_relocatable_target, /*Default=*/true); - else if (JA.isOffloading(Action::OFK_Cuda)) + else if (JA.isOffloading(Action::OFK_Cuda) || + JA.isOffloading(Action::OFK_HIP)) Relocatable = Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, /*Default=*/false); @@ -572,10 +768,12 @@ StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ); assert(!GpuArch.empty() && "Must have an explicit GPU arch."); assert((DeviceOffloadingKind == Action::OFK_OpenMP || - DeviceOffloadingKind == Action::OFK_Cuda) && - "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs."); + DeviceOffloadingKind == Action::OFK_Cuda || + DeviceOffloadingKind == Action::OFK_HIP) && + "Only OpenMP, CUDA, or HIP offloading kinds are supported for GPUs."); - if (DeviceOffloadingKind == Action::OFK_Cuda) { + if (DeviceOffloadingKind == Action::OFK_Cuda || + DeviceOffloadingKind == Action::OFK_HIP) { CC1Args.push_back("-fcuda-is-device"); if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero, @@ -754,13 +952,21 @@ return DAL; } +Tool *CudaToolChain::buildDeviceLibraryLinker() const { + return new tools::AMDGCN::DeviceLibraryLinker(*this); +} + Tool *CudaToolChain::buildAssembler() const { + if (getTriple().getArch() == llvm::Triple::amdgcn) + return new tools::AMDGCN::Assembler(*this); return new tools::NVPTX::Assembler(*this); } Tool *CudaToolChain::buildLinker() const { if (OK == 
Action::OFK_OpenMP) return new tools::NVPTX::OpenMPLinker(*this); + if (getTriple().getArch() == llvm::Triple::amdgcn) + return new tools::AMDGCN::Linker(*this); return new tools::NVPTX::Linker(*this); } Index: test/Driver/cuda-bad-arch.cu =================================================================== --- test/Driver/cuda-bad-arch.cu +++ test/Driver/cuda-bad-arch.cu @@ -2,6 +2,7 @@ // REQUIRES: clang-driver // REQUIRES: x86-registered-target // REQUIRES: nvptx-registered-target +// REQUIRES: amdgpu-registered-target // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=compute_20 -c %s 2>&1 \ // RUN: | FileCheck -check-prefix BAD %s @@ -25,9 +26,12 @@ // RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \ // RUN: | FileCheck -check-prefix OK %s -// We don't allow using NVPTX for host compilation. +// We don't allow using NVPTX/AMDGCN for host compilation. // RUN: %clang -### --cuda-host-only -target nvptx-nvidia-cuda -c %s 2>&1 \ // RUN: | FileCheck -check-prefix HOST_NVPTX %s +// RUN: %clang -### --cuda-host-only -target amdgcn-amd-amdhsa -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix HOST_AMDGCN %s // OK-NOT: error: Unsupported CUDA gpu architecture -// HOST_NVPTX: error: unsupported use of NVPTX for host compilation. +// HOST_NVPTX: error: unsupported architecture 'nvptx' for host compilation. +// HOST_AMDGCN: error: unsupported architecture 'amdgcn' for host compilation. Index: test/Driver/cuda-phases.cu =================================================================== --- test/Driver/cuda-phases.cu +++ test/Driver/cuda-phases.cu @@ -7,195 +7,233 @@ // REQUIRES: clang-driver // REQUIRES: powerpc-registered-target // REQUIRES: nvptx-registered-target - +// REQUIRES: amdgpu-registered-target // // Test single gpu architecture with complete compilation. 
// // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \ -// RUN: | FileCheck -check-prefix=BIN %s -// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) -// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30) -// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30) -// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30) -// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30) -// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object -// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler -// BIN-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-cuda) -// BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P10]]}, ir -// BIN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-cuda) -// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-cuda) -// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-cuda) +// RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s +// BIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]]) +// BIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]]) +// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]]) +// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]]) +// BIN_NV-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH:sm_30]]) +// BIN_AMD-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH:gfx803]]) +// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]]) +// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]]) +// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]]) +// BIN_NV-DAG: [[P8:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH]])" {[[P7]]}, object +// BIN_AMD-DAG: [[P8:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P7]]}, object +// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH]])" {[[P6]]}, assembler +// BIN-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-[[T]]) +// BIN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P10]]}, ir +// BIN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]]) +// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]]) +// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]]) // // Test single gpu architecture up to the assemble phase. 
// // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \ -// RUN: | FileCheck -check-prefix=ASM %s -// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (host-cuda) -// ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-cuda) -// ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-cuda) +// RUN: | FileCheck -check-prefixes=ASM,ASM_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=ASM,ASM_AMD %s +// ASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]]) +// ASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]]) +// ASM-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler +// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-[[T]]) +// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (host-[[T]]) +// ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-[[T]]) +// ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-[[T]]) // // Test two gpu architectures with complete compilation. 
// // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ -// RUN: | FileCheck -check-prefix=BIN2 %s -// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) -// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30) -// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30) -// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30) -// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30) -// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object -// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler -// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, sm_35) -// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, sm_35) -// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, sm_35) -// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, sm_35) -// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P14]]}, object -// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P13]]}, assembler -// BIN2-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-cuda) -// BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P17]]}, ir -// BIN2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-cuda) -// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-cuda) -// BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-cuda) +// RUN: | FileCheck -check-prefixes=BIN2,BIN2_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN2,BIN2_AMD %s +// BIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]]) +// BIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]]) +// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]]) +// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]]) +// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH1:sm_30|gfx803]]) +// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH1]]) +// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH1]]) +// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH1]]) +// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH1]]) +// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P7]]}, object +// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH1]])" {[[P6]]}, assembler +// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]]) +// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]]) +// BIN2-DAG: [[P12:[0-9]+]]: 
compiler, {[[P11]]}, ir, (device-[[T]], [[ARCH2]]) +// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-[[T]], [[ARCH2]]) +// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-[[T]], [[ARCH2]]) +// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P14]]}, object +// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P13]]}, assembler +// BIN2-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-[[T]]) +// BIN2-DAG: [[P18:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P17]]}, ir +// BIN2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-[[T]]) +// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-[[T]]) +// BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-[[T]]) // // Test two gpu architecturess up to the assemble phase. // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \ -// RUN: | FileCheck -check-prefix=ASM2 %s -// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35) -// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35) -// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35) -// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler -// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (host-cuda) -// ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-cuda) -// ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-cuda) +// RUN: | FileCheck -check-prefixes=ASM2,ASM2_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=ASM2,ASM2_AMD %s +// ASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH1:sm_30]]) +// ASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH1:gfx803]]) +// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH1]]) +// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH1]]) +// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH1]]) +// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P3]]}, assembler +// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]]) +// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]]) +// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]]) +// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]]) +// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] 
([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler +// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-[[T]]) +// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (host-[[T]]) +// ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-[[T]]) +// ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-[[T]]) // // Test single gpu architecture with complete compilation in host-only // compilation mode. // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \ -// RUN: | FileCheck -check-prefix=HBIN %s -// HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) -// HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) -// HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda) -// HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda) +// RUN: | FileCheck -check-prefixes=HBIN,HBIN_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefixes=HBIN,HBIN_AMD %s +// HBIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]]) +// HBIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]]) +// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]]) +// HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]]) +// HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]]) +// HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-[[T]]) +// HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-[[T]]) // // Test single gpu architecture up to the assemble phase in host-only // compilation mode. // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \ -// RUN: | FileCheck -check-prefix=HASM %s -// HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) -// HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) +// RUN: | FileCheck -check-prefixes=HASM,HASM_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-host-only -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=HASM,HASM_AMD %s +// HASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]]) +// HASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]]) +// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]]) +// HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]]) +// HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]]) // // Test two gpu architectures with complete compilation in host-only // compilation mode. 
// // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \ -// RUN: | FileCheck -check-prefix=HBIN2 %s -// HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) -// HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) -// HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda) -// HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda) +// RUN: | FileCheck -check-prefixes=HBIN2,HBIN2_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefixes=HBIN2,HBIN2_AMD %s +// HBIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]]) +// HBIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]]) +// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]]) +// HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]]) +// HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]]) +// HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-[[T]]) +// HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-[[T]]) // // Test two gpu architectures up to the assemble phase in host-only // compilation mode. // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \ -// RUN: | FileCheck -check-prefix=HASM2 %s -// HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) -// HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) +// RUN: | FileCheck -check-prefixes=HASM2,HASM2_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=HASM2,HASM2_AMD %s +// HASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]]) +// HASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]]) +// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]]) +// HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]]) +// HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]]) // // Test single gpu architecture with complete compilation in device-only // compilation mode. 
// // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \ -// RUN: | FileCheck -check-prefix=DBIN %s -// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30) -// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object +// RUN: | FileCheck -check-prefixes=DBIN,DBIN_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-device-only 2>&1 \ +// RUN: | FileCheck -check-prefixes=DBIN,DBIN_AMD %s +// DBIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]]) +// DBIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]]) +// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]]) +// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P4]]}, object // // Test single gpu architecture up to the assemble phase in device-only // compilation mode. // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \ -// RUN: | FileCheck -check-prefix=DASM %s -// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler +// RUN: | FileCheck -check-prefixes=DASM,DASM_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-device-only -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=DASM,DASM_AMD %s +// DASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]]) +// DASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]]) +// DASM-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler // // Test two gpu architectures with complete compilation in device-only // compilation mode. 
// // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \ -// RUN: | FileCheck -check-prefix=DBIN2 %s -// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30) -// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object -// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, sm_35) -// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, sm_35) -// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, sm_35) -// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, sm_35) -// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P10]]}, object +// RUN: | FileCheck -check-prefixes=DBIN2,DBIN2_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only 2>&1 \ +// RUN: | FileCheck -check-prefixes=DBIN2,DBIN2_AMD %s +// DBIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]]) +// DBIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]]) +// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]]) +// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P4]]}, object +// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]]) +// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]]) +// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-[[T]], [[ARCH2]]) +// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-[[T]], [[ARCH2]]) +// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-[[T]], [[ARCH2]]) +// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P10]]}, object // // Test two gpu architectures up to the assemble phase in device-only // compilation mode. 
// // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \ -// RUN: | FileCheck -check-prefix=DASM2 %s -// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35) -// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35) -// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35) -// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler +// RUN: | FileCheck -check-prefixes=DASM2,DASM2_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only -S 2>&1 \ +// RUN: | FileCheck -check-prefixes=DASM2,DASM2_AMD %s +// DASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]]) +// DASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]]) +// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler +// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]]) +// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]]) +// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]]) +// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]]) +// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler
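Note (illustration only, not part of the patch): the AMDGCN::DeviceLibraryLinker changes in lib/Driver/ToolChains/Cuda.cpp above translate the last -O group option into the optimization level passed to the opt invocation. The sketch below restates that mapping as a standalone function; the helper name optLevelForOpt and the use of plain std::string in place of the driver's Arg/StringSwitch machinery are assumptions made for this example.

#include <cassert>
#include <string>

// Hypothetical helper: given the value of the last -O option ("0", "1", "2",
// "3", "s", "z", "4", "fast"), return the level the DeviceLibraryLinker job
// passes to opt. -O4/-Ofast clamp to 3; -Os, -Oz and unrecognized values fall
// back to 2, matching the StringSwitch default in the patch.
static std::string optLevelForOpt(const std::string &OValue) {
  if (OValue == "4" || OValue == "fast" || OValue == "3")
    return "3";
  if (OValue == "0" || OValue == "1" || OValue == "2")
    return OValue;
  return "2";
}

int main() {
  assert(optLevelForOpt("fast") == "3"); // -Ofast behaves like -O3
  assert(optLevelForOpt("z") == "2");    // -Oz is lowered to -O2 for opt
  assert(optLevelForOpt("0") == "0");    // -O0 is preserved
  return 0;
}

As in the patch, size optimizations are not forwarded as -Os/-Oz but lowered to -O2, and the extra passes (-dce, -sroa, -globaldce) together with -mcpu are only appended when an -O group option is present on the command line.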