Index: include/clang/Basic/Cuda.h =================================================================== --- include/clang/Basic/Cuda.h +++ include/clang/Basic/Cuda.h @@ -46,6 +46,21 @@ SM_62, SM_70, SM_72, + GFX600, + GFX601, + GFX700, + GFX701, + GFX702, + GFX703, + GFX704, + GFX800, + GFX801, + GFX802, + GFX803, + GFX810, + GFX900, + GFX901, + GFX902, LAST, }; const char *CudaArchToString(CudaArch A); @@ -68,6 +83,7 @@ COMPUTE_62, COMPUTE_70, COMPUTE_72, + COMPUTE_AMDGCN, }; const char *CudaVirtualArchToString(CudaVirtualArch A); Index: include/clang/Driver/ToolChain.h =================================================================== --- include/clang/Driver/ToolChain.h +++ include/clang/Driver/ToolChain.h @@ -108,10 +108,12 @@ path_list ProgramPaths; mutable std::unique_ptr Clang; + mutable std::unique_ptr Backend; mutable std::unique_ptr Assemble; mutable std::unique_ptr Link; mutable std::unique_ptr OffloadBundler; Tool *getClang() const; + Tool *getBackend() const; Tool *getAssemble() const; Tool *getLink() const; Tool *getClangAs() const; @@ -139,6 +141,7 @@ void setTripleEnvironment(llvm::Triple::EnvironmentType Env); virtual Tool *buildAssembler() const; + virtual Tool *buildBackend() const; virtual Tool *buildLinker() const; virtual Tool *getTool(Action::ActionClass AC) const; Index: lib/Basic/Cuda.cpp =================================================================== --- lib/Basic/Cuda.cpp +++ lib/Basic/Cuda.cpp @@ -58,6 +58,36 @@ return "sm_70"; case CudaArch::SM_72: return "sm_72"; + case CudaArch::GFX600://tahiti + return "gfx600"; + case CudaArch::GFX601://pitcairn, verde, oland,hainan + return "gfx601"; + case CudaArch::GFX700://kaveri + return "gfx700"; + case CudaArch::GFX701://hawaii + return "gfx701"; + case CudaArch::GFX702://290,290x,R390,R390x + return "gfx702"; + case CudaArch::GFX703://kabini mullins + return "gfx703"; + case CudaArch::GFX704://bonaire + return "gfx704"; + case CudaArch::GFX800://iceland + return "gfx800"; + case CudaArch::GFX801://carrizo + return "gfx801"; + case CudaArch::GFX802://tonga,iceland + return "gfx802"; + case CudaArch::GFX803://fiji,polaris10 + return "gfx803"; + case CudaArch::GFX810://stoney + return "gfx810"; + case CudaArch::GFX900: //vega, instinct + return "gfx900"; + case CudaArch::GFX901: // + return "gfx901"; + case CudaArch::GFX902: // TBA + return "gfx902"; } llvm_unreachable("invalid enum"); } @@ -78,6 +108,21 @@ .Case("sm_62", CudaArch::SM_62) .Case("sm_70", CudaArch::SM_70) .Case("sm_72", CudaArch::SM_72) + .Case("gfx600", CudaArch::GFX600) + .Case("gfx601", CudaArch::GFX601) + .Case("gfx700", CudaArch::GFX700) + .Case("gfx701", CudaArch::GFX701) + .Case("gfx702", CudaArch::GFX702) + .Case("gfx703", CudaArch::GFX703) + .Case("gfx704", CudaArch::GFX704) + .Case("gfx800", CudaArch::GFX800) + .Case("gfx801", CudaArch::GFX801) + .Case("gfx802", CudaArch::GFX802) + .Case("gfx803", CudaArch::GFX803) + .Case("gfx810", CudaArch::GFX810) + .Case("gfx900", CudaArch::GFX900) + .Case("gfx901", CudaArch::GFX901) + .Case("gfx902", CudaArch::GFX902) .Default(CudaArch::UNKNOWN); } @@ -111,6 +156,8 @@ return "compute_70"; case CudaVirtualArch::COMPUTE_72: return "compute_72"; + case CudaVirtualArch::COMPUTE_AMDGCN: + return "compute_amdgcn"; } llvm_unreachable("invalid enum"); } @@ -130,6 +177,7 @@ .Case("compute_62", CudaVirtualArch::COMPUTE_62) .Case("compute_70", CudaVirtualArch::COMPUTE_70) .Case("compute_72", CudaVirtualArch::COMPUTE_72) + .Case("compute_amdgcn", CudaVirtualArch::COMPUTE_AMDGCN) .Default(CudaVirtualArch::UNKNOWN); } @@ -166,6 +214,22 @@ return CudaVirtualArch::COMPUTE_70; case CudaArch::SM_72: return CudaVirtualArch::COMPUTE_72; + case CudaArch::GFX600: + case CudaArch::GFX601: + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX702: + case CudaArch::GFX703: + case CudaArch::GFX704: + case CudaArch::GFX800: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: + case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX901: + case CudaArch::GFX902: + return CudaVirtualArch::COMPUTE_AMDGCN; } llvm_unreachable("invalid enum"); } @@ -194,6 +258,22 @@ return CudaVersion::CUDA_90; case CudaArch::SM_72: return CudaVersion::CUDA_91; + case CudaArch::GFX600: + case CudaArch::GFX601: + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX702: + case CudaArch::GFX703: + case CudaArch::GFX704: + case CudaArch::GFX800: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: + case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX901: + case CudaArch::GFX902: + return CudaVersion::CUDA_70; } llvm_unreachable("invalid enum"); } @@ -204,6 +284,21 @@ return CudaVersion::UNKNOWN; case CudaArch::SM_20: case CudaArch::SM_21: + case CudaArch::GFX600: + case CudaArch::GFX601: + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX702: + case CudaArch::GFX703: + case CudaArch::GFX704: + case CudaArch::GFX800: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: + case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX901: + case CudaArch::GFX902: return CudaVersion::CUDA_80; default: return CudaVersion::LATEST; Index: lib/Basic/Targets/AMDGPU.h =================================================================== --- lib/Basic/Targets/AMDGPU.h +++ lib/Basic/Targets/AMDGPU.h @@ -14,6 +14,7 @@ #ifndef LLVM_CLANG_LIB_BASIC_TARGETS_AMDGPU_H #define LLVM_CLANG_LIB_BASIC_TARGETS_AMDGPU_H +#include "clang/Basic/Cuda.h" #include "clang/Basic/TargetInfo.h" #include "clang/Basic/TargetOptions.h" #include "llvm/ADT/StringSet.h" @@ -143,6 +144,8 @@ return TT.getArch() == llvm::Triple::amdgcn; } + CudaArch GCN_Subarch; + static bool isGenericZero(const llvm::Triple &TT) { return true; } public: @@ -292,7 +295,7 @@ GPU = parseAMDGCNName(Name); else GPU = parseR600Name(Name); - + GCN_Subarch = StringToCudaArch(Name); return GPU != GK_NONE; } @@ -377,6 +380,7 @@ // address space has value 0 but in private and local address space has // value ~0. uint64_t getNullPointerValue(LangAS AS) const override { + if(getTriple().getOS()==llvm::Triple::CUDA) return 0; return AS == LangAS::opencl_local ? ~0 : 0; } }; Index: lib/Basic/Targets/AMDGPU.cpp =================================================================== --- lib/Basic/Targets/AMDGPU.cpp +++ lib/Basic/Targets/AMDGPU.cpp @@ -293,6 +293,7 @@ : DataLayoutStringSIPrivateIsZero) : DataLayoutStringR600); assert(DataLayout->getAllocaAddrSpace() == AS.Private); + GCN_Subarch = CudaArch::GFX803; // Default to fiji setAddressSpaceMap(Triple.getOS() == llvm::Triple::Mesa3D || Triple.getEnvironment() == llvm::Triple::OpenCL || @@ -324,9 +325,49 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const { - if (getTriple().getArch() == llvm::Triple::amdgcn) - Builder.defineMacro("__AMDGCN__"); - else + if (getTriple().getArch() == llvm::Triple::amdgcn) { + if(getTriple().getOS() == llvm::Triple::CUDA) { + std::string GFXProcCode = [this] { + switch (GCN_Subarch) { + case CudaArch::SM_20: return "000"; + case CudaArch::SM_21: return "000"; + case CudaArch::SM_30: return "000"; + case CudaArch::SM_32: return "000"; + case CudaArch::SM_35: return "000"; + case CudaArch::SM_37: return "000"; + case CudaArch::SM_50: return "000"; + case CudaArch::SM_52: return "000"; + case CudaArch::SM_53: return "000"; + case CudaArch::SM_60: return "000"; + case CudaArch::SM_61: return "000"; + case CudaArch::SM_62: return "000"; + case CudaArch::SM_70: return "000"; + case CudaArch::SM_72: return "000"; + case CudaArch::GFX600: return "600"; + case CudaArch::GFX601: return "601"; + case CudaArch::GFX700: return "700"; + case CudaArch::GFX701: return "701"; + case CudaArch::GFX702: return "702"; + case CudaArch::GFX703: return "703"; + case CudaArch::GFX704: return "704"; + case CudaArch::GFX800: return "800"; + case CudaArch::GFX801: return "801"; + case CudaArch::GFX802: return "802"; + case CudaArch::GFX803: return "803"; + case CudaArch::GFX810: return "810"; + case CudaArch::GFX900: return "900"; + case CudaArch::GFX901: return "901"; + case CudaArch::GFX902: return "902"; + case CudaArch::UNKNOWN:; + case CudaArch::LAST:; + } + llvm_unreachable("unhandled GFX processor"); + }(); + Builder.defineMacro("__AMDGCN__", GFXProcCode); + } else { + Builder.defineMacro("__AMDGCN__"); + } + } else Builder.defineMacro("__R600__"); if (hasFMAF) @@ -335,4 +376,45 @@ Builder.defineMacro("__HAS_LDEXPF__"); if (hasFP64) Builder.defineMacro("__HAS_FP64__"); + if(getTriple().getOS() == llvm::Triple::CUDA) { + // Set __CUDA_ARCH__ for the GPU specified. + std::string CUDAArchCode = [this] { + switch (GCN_Subarch) { + case CudaArch::SM_20: return "200"; + case CudaArch::SM_21: return "210"; + case CudaArch::SM_30: return "300"; + case CudaArch::SM_32: return "320"; + case CudaArch::SM_35: return "350"; + case CudaArch::SM_37: return "370"; + case CudaArch::SM_50: return "500"; + case CudaArch::SM_52: return "520"; + case CudaArch::SM_53: return "530"; + case CudaArch::SM_60: return "600"; + case CudaArch::SM_61: return "610"; + case CudaArch::SM_62: return "620"; + case CudaArch::SM_70: return "700"; + case CudaArch::SM_72: return "720"; + case CudaArch::GFX600: return "320"; + case CudaArch::GFX601: return "320"; + case CudaArch::GFX700: return "320"; + case CudaArch::GFX701: return "320"; + case CudaArch::GFX702: return "320"; + case CudaArch::GFX703: return "320"; + case CudaArch::GFX704: return "320"; + case CudaArch::GFX800: return "320"; + case CudaArch::GFX801: return "320"; + case CudaArch::GFX802: return "320"; + case CudaArch::GFX803: return "320"; + case CudaArch::GFX810: return "320"; + case CudaArch::GFX900: return "320"; + case CudaArch::GFX901: return "320"; + case CudaArch::GFX902: return "320"; + case CudaArch::UNKNOWN:; + case CudaArch::LAST:; + } + llvm_unreachable("unhandled GCN Subarch"); + }(); + // Set __CUDA_ARCH__ for the GPU specified. + Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode); + } } Index: lib/Basic/Targets/NVPTX.cpp =================================================================== --- lib/Basic/Targets/NVPTX.cpp +++ lib/Basic/Targets/NVPTX.cpp @@ -190,6 +190,23 @@ return "700"; case CudaArch::SM_72: return "720"; + case CudaArch::GFX600: + case CudaArch::GFX601: + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX702: + case CudaArch::GFX703: + case CudaArch::GFX704: + case CudaArch::GFX800: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: + case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX901: + case CudaArch::GFX902: + return "320"; + } llvm_unreachable("unhandled CudaArch"); }(); Index: lib/Driver/Driver.cpp =================================================================== --- lib/Driver/Driver.cpp +++ lib/Driver/Driver.cpp @@ -2301,7 +2301,8 @@ const ToolChain *HostTC = C.getSingleOffloadToolChain(); assert(HostTC && "No toolchain for host compilation."); - if (HostTC->getTriple().isNVPTX()) { + if (HostTC->getTriple().isNVPTX() || + HostTC->getTriple().getArch()==llvm::Triple::amdgcn) { // We do not support targeting NVPTX for host compilation. Throw // an error and abort pipeline construction early so we don't trip // asserts that assume device-side compilation. @@ -3198,6 +3199,9 @@ bool SaveTemps; bool EmbedBitcode; + /// Type of the input file for the tool + types::ID InputType; + /// Get previous dependent action or null if that does not exist. If /// \a CanBeCollapsed is false, that action must be legal to collapse or /// null will be returned. @@ -3255,6 +3259,8 @@ bool canCollapsePreprocessorAction() const { return !C.getArgs().hasArg(options::OPT_no_integrated_cpp) && !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps && + (InputType != types::TY_LLVM_IR) && + (InputType != types::TY_LLVM_BC) && !C.getArgs().hasArg(options::OPT_rewrite_objc); } @@ -3300,6 +3306,14 @@ if (!AJ || !BJ || !CJ) return nullptr; + // Cannot combine compilation with backend for amdgcn backend + if(( AJ->isOffloading(Action::OFK_Cuda) || + AJ->isOffloading(Action::OFK_OpenMP)) && + (StringRef(AJ->getOffloadingArch()).startswith("gfx") || + TC.getTriple().getArch() == llvm::Triple::amdgcn)) { + return nullptr; + } + // Get compiler tool. const Tool *T = TC.SelectTool(*CJ); if (!T) @@ -3331,6 +3345,14 @@ if (!AJ || !BJ) return nullptr; + // Cannot combine assemble with backend for amdgcn backend + if(( AJ->isOffloading(Action::OFK_Cuda) || + AJ->isOffloading(Action::OFK_OpenMP)) && + (StringRef(AJ->getOffloadingArch()).startswith("gfx") || + TC.getTriple().getArch() == llvm::Triple::amdgcn)) { + return nullptr; + } + // Retrieve the compile job, backend action must always be preceded by one. ActionList CompileJobOffloadActions; auto *CJ = getPrevDependentAction(BJ->getInputs(), CompileJobOffloadActions, @@ -3364,6 +3386,20 @@ if (!BJ || !CJ) return nullptr; + // Cannot combine compilation with backend for amdgcn backend + if((BJ->isOffloading(Action::OFK_Cuda) || + BJ->isOffloading(Action::OFK_OpenMP)) && + (StringRef(BJ->getOffloadingArch()).startswith("gfx") || + TC.getTriple().getArch() == llvm::Triple::amdgcn)) { + // It is necessary to combine when generating IR for compile-only with + // flags "-c -S -emit-llvm". If only flags "-c -S" the gcn backend is + // needed to generate linked and opt IR for llc, so do not combine. + if( ! ( C.getArgs().hasArg(options::OPT_c) && + C.getArgs().hasArg(options::OPT_S) && + C.getArgs().hasArg(options::OPT_emit_llvm)) ) + return nullptr; + } + // Get compiler tool. const Tool *T = TC.SelectTool(*CJ); if (!T) @@ -3407,6 +3443,14 @@ EmbedBitcode(EmbedBitcode) { assert(BaseAction && "Invalid base action."); IsHostSelector = BaseAction->getOffloadingDeviceKind() == Action::OFK_None; + // Store the InputType to check if Compile and Backend can collapse + for(Arg* A : C.getInputArgs()) { + if (A->getOption().getKind() == Option::InputClass) { + const char *Value = A->getValue(); + if (const char *Ext = strrchr(Value, '.')) + InputType = TC.LookupTypeForExtension(Ext + 1); + } + } } /// Check if a chain of actions can be combined and return the tool that can @@ -3907,7 +3951,10 @@ JA.getType() == types::TY_LLVM_BC) Suffixed += ".tmp"; Suffixed += '.'; - Suffixed += Suffix; + if (((StringRef)BaseInput).endswith(".a")) + Suffixed += "a"; + else + Suffixed += Suffix; NamedOutput = C.getArgs().MakeArgString(Suffixed.c_str()); } Index: lib/Driver/SanitizerArgs.cpp =================================================================== --- lib/Driver/SanitizerArgs.cpp +++ lib/Driver/SanitizerArgs.cpp @@ -720,7 +720,8 @@ // NVPTX doesn't currently support sanitizers. Bailing out here means that // e.g. -fsanitize=address applies only to host code, which is what we want // for now. - if (TC.getTriple().isNVPTX()) + if (TC.getTriple().isNVPTX() || + TC.getTriple().getArch()==llvm::Triple::amdgcn) return; // Translate available CoverageFeatures to corresponding clang-cc1 flags. Index: lib/Driver/ToolChain.cpp =================================================================== --- lib/Driver/ToolChain.cpp +++ lib/Driver/ToolChain.cpp @@ -241,6 +241,10 @@ return Clang.get(); } +Tool *ToolChain::buildBackend() const { + return new tools::Clang(*this); +} + Tool *ToolChain::buildAssembler() const { return new tools::ClangAs(*this); } @@ -255,6 +259,12 @@ return Assemble.get(); } +Tool *ToolChain::getBackend() const { + if(!Backend) + Backend.reset(buildBackend()); + return Backend.get(); +} + Tool *ToolChain::getClangAs() const { if (!Assemble) Assemble.reset(new tools::ClangAs(*this)); @@ -295,8 +305,9 @@ case Action::AnalyzeJobClass: case Action::MigrateJobClass: case Action::VerifyPCHJobClass: - case Action::BackendJobClass: return getClang(); + case Action::BackendJobClass: + return getBackend(); case Action::OffloadBundlingJobClass: case Action::OffloadUnbundlingJobClass: @@ -390,8 +401,21 @@ } Tool *ToolChain::SelectTool(const JobAction &JA) const { - if (getDriver().ShouldUseClangCompiler(JA)) return getClang(); Action::ActionClass AC = JA.getKind(); + // The amdgcn Backend needs buildBackend() + //if ( StringRef(JA.getOffloadingArch()).startswith("gfx") && + if((JA.isOffloading(Action::OFK_Cuda) || + JA.isOffloading(Action::OFK_OpenMP)) && + (StringRef(JA.getOffloadingArch()).startswith("gfx") || + (getTriple().getArch() == llvm::Triple::amdgcn)) && + (AC == Action::BackendJobClass) ) { + if ( (Args.hasArg(options::OPT_emit_llvm)) || + (Args.hasArg(options::OPT_emit_llvm_bc)) ) + return getClang(); // Dont run amdgcn backend if we just want LLVM IR + else + return getTool(AC); + }; + if (getDriver().ShouldUseClangCompiler(JA)) return getClang(); if (AC == Action::AssembleJobClass && useIntegratedAs()) return getClangAs(); return getTool(AC); Index: lib/Driver/ToolChains/Clang.cpp =================================================================== --- lib/Driver/ToolChains/Clang.cpp +++ lib/Driver/ToolChains/Clang.cpp @@ -2322,9 +2322,10 @@ ArgStringList &CmdArgs, bool KernelOrKext) { const llvm::Triple &EffectiveTriple = TC.getEffectiveTriple(); - // NVPTX doesn't support stack protectors; from the compiler's perspective, it - // doesn't even have a stack! - if (EffectiveTriple.isNVPTX()) + // NVPTX and GCN don't support stack protectors; from the compiler's + // perspective, it doesn't even have a stack! + if (EffectiveTriple.isNVPTX() || + EffectiveTriple.getArch()==llvm::Triple::amdgcn) return; // -stack-protector=0 is default. @@ -3065,7 +3066,14 @@ const ArgList &Args, const char *LinkingOutput) const { const llvm::Triple &RawTriple = getToolChain().getTriple(); const llvm::Triple &Triple = getToolChain().getEffectiveTriple(); - const std::string &TripleStr = Triple.getTriple(); + + bool Is_amdgcn = StringRef(JA.getOffloadingArch()).startswith("gfx") || + (getToolChain().getArch() == llvm::Triple::amdgcn) ; + // Currently cuda driver only support offload triple nvptx64-nvidia-cuda. + // Switch this from nvptx to amdgcn iff the subarch is a gfx processor. + const std::string &TripleStr = Is_amdgcn && + (JA.isOffloading(Action::OFK_Cuda) || JA.isOffloading(Action::OFK_OpenMP)) + ? "amdgcn--cuda" : Triple.getTriple(); bool KernelOrKext = Args.hasArg(options::OPT_mkernel, options::OPT_fapple_kext); @@ -3477,7 +3485,8 @@ // Enable -mconstructor-aliases except on darwin, where we have to work around // a linker bug (see ), and CUDA device code, where // aliases aren't supported. - if (!RawTriple.isOSDarwin() && !RawTriple.isNVPTX()) + if (!RawTriple.isOSDarwin() && !RawTriple.isNVPTX() && + RawTriple.getArch()!=llvm::Triple::amdgcn) CmdArgs.push_back("-mconstructor-aliases"); // Darwin's kernel doesn't support guard variables; just die if we Index: lib/Driver/ToolChains/Cuda.h =================================================================== --- lib/Driver/ToolChains/Cuda.h +++ lib/Driver/ToolChains/Cuda.h @@ -12,6 +12,7 @@ #include "clang/Basic/Cuda.h" #include "clang/Basic/VersionTuple.h" +#include "clang/Basic/DebugInfoOptions.h" #include "clang/Driver/Action.h" #include "clang/Driver/Multilib.h" #include "clang/Driver/ToolChain.h" @@ -30,6 +31,7 @@ private: const Driver &D; bool IsValid = false; + bool UseOpenHeaders = false; CudaVersion Version = CudaVersion::UNKNOWN; std::string InstallPath; std::string BinPath; @@ -57,6 +59,7 @@ /// \brief Check whether we detected a valid Cuda install. bool isValid() const { return IsValid; } + bool useOpenHeaders() const { return UseOpenHeaders; } /// \brief Print information about the detected CUDA installation. void print(raw_ostream &OS) const; @@ -81,6 +84,20 @@ namespace tools { namespace NVPTX { +// for amdgcn the backend is llvm-link + opt +class LLVM_LIBRARY_VISIBILITY Backend : public Tool { + public: + Backend(const ToolChain &TC) + : Tool("NVPTX::Backend", "GPU-backend", TC, RF_Full, llvm::sys::WEM_UTF8, + "--options-file") {} + virtual bool hasIntegratedCPP() const override { return false; } + virtual void ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const override; +}; + // Run ptxas, the NVPTX assembler. class LLVM_LIBRARY_VISIBILITY Assembler : public Tool { public: @@ -127,6 +144,16 @@ }; } // end namespace NVPTX + +void addBCLib(Compilation &C, const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs, + llvm::opt::ArgStringList LibraryPaths, + const char *BCName) ; + +void addEnvListWithSpaces(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs, + const char *EnvVar) ; + } // end namespace tools namespace toolchains { @@ -184,6 +211,7 @@ CudaInstallationDetector CudaInstallation; protected: + Tool *buildBackend() const override; // for amdgcn, link and opt Tool *buildAssembler() const override; // ptxas Tool *buildLinker() const override; // fatbinary (ok, not really a linker) Index: lib/Driver/ToolChains/Cuda.cpp =================================================================== --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -221,6 +221,49 @@ IsValid = true; break; } + + // Search for GCN Device Libraries + std::string GCNPath; + for (Arg *A : Args) { + if( A->getOption().matches(options::OPT_cuda_gpu_arch_EQ) && + StringRef(A->getValue()).startswith("gfx") ) { + SmallVector GCNPathCandidates; + if (const char *libamdgcn = getenv("LIBAMDGCN")) + GCNPathCandidates.push_back(D.SysRoot + libamdgcn); + else + GCNPathCandidates.push_back(D.SysRoot + "/opt/rocm/libamdgcn"); + for (const auto &CPath : GCNPathCandidates) { + if (CPath.empty() || !D.getVFS().exists(CPath)) + continue; + GCNPath = CPath; + } + break; + } + } + + // The directory names of GCN device libraries are the gfxnames. + // e.g. /opt/rocm/libamdgcn/gfx701 + bool no_cuda_install = !IsValid; + if (! GCNPath.empty()) { + auto &FS = D.getVFS(); + std::error_code EC; + for (llvm::sys::fs::directory_iterator LI(GCNPath, EC), LE; + !EC && LI != LE; LI = LI.increment(EC)) { + StringRef Dirname = LI->path(); + StringRef GCNname = Dirname.rsplit('/').second; + if(GCNname.startswith("gfx")) { + std::string OCLFilePath = Dirname.str() + "/lib/opencl.amdgcn.bc"; + if (FS.exists(OCLFilePath)) { + if (no_cuda_install) + UseOpenHeaders=true; + LibDeviceMap[GCNname.str()] = OCLFilePath; + IsValid = true; + } + } + continue; + } + } + } void CudaInstallationDetector::AddCudaIncludeArgs( @@ -246,7 +289,12 @@ CC1Args.push_back("-internal-isystem"); CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath())); CC1Args.push_back("-include"); - CC1Args.push_back("__clang_cuda_runtime_wrapper.h"); + if (useOpenHeaders()) { + CC1Args.push_back("__clang_cuda_runtime_wrapper_open.h"); + CC1Args.push_back("-D__USE_OPEN_HEADERS__"); + } else { + CC1Args.push_back("__clang_cuda_runtime_wrapper.h"); + } } void CudaInstallationDetector::CheckCudaVersionSupportsArch( @@ -272,6 +320,148 @@ << CudaVersionToString(Version) << "\n"; } +void tools::addBCLib(Compilation &C, const ArgList &Args, + ArgStringList &CmdArgs, ArgStringList LibraryPaths, const char *BCName) { + std::string FullName; + bool FoundLibDevice = false; + for (std::string LibraryPath : LibraryPaths) { + LibraryPath = LibraryPath.substr(2, LibraryPath.length() - 2); // -L + FullName = Args.MakeArgString(LibraryPath + "/" + BCName ); + if (llvm::sys::fs::exists( FullName.c_str())) { + FoundLibDevice = true; + break; + } + } + if (!FoundLibDevice) + C.getDriver().Diag(diag::err_drv_no_such_file) << BCName ; + CmdArgs.push_back(Args.MakeArgString(FullName)); +} + +void tools::addEnvListWithSpaces(const ArgList &Args, ArgStringList &CmdArgs, + const char *EnvVar) { + const char *DirList = ::getenv(EnvVar); + if (!DirList) return; + StringRef Dirs(DirList); + if (Dirs.empty()) return; + StringRef::size_type Delim; + while ((Delim = Dirs.find(" ")) != StringRef::npos) { + if (Delim != 0) + CmdArgs.push_back(Args.MakeArgString(Dirs.substr(0, Delim))); + Dirs = Dirs.substr(Delim + 1); // Trim front space + } + if (!Dirs.empty()) // Last arg may have no spaces + CmdArgs.push_back(Args.MakeArgString(Dirs)); +} + +void NVPTX::Backend::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + + assert(StringRef(JA.getOffloadingArch()).startswith("gfx") && + " unless gfx processor, backend should be clang") ; + + // For amdgcn the Backend Job will call 3.9 llvm-link & opt steps + ArgStringList CmdArgs; + // Add the input bc's created by compile step + for (InputInfoList::const_iterator + it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { + const InputInfo &II = *it; + CmdArgs.push_back(II.getFilename()); + } + + std::string GFXNAME = JA.getOffloadingArch(); + + ArgStringList LibraryPaths; + + // Always search for bc libs in libamdgcn first + const char * libamdgcn; + libamdgcn = getenv("LIBAMDGCN"); + if (!libamdgcn) libamdgcn = "/opt/rocm/libamdgcn"; + LibraryPaths.push_back(Args.MakeArgString( + "-L" + std::string(libamdgcn) + "/" + std::string(GFXNAME) + "/lib")); + + // Find in -L and LIBRARY_PATH. + for (auto Arg : Args) { + if (Arg->getSpelling() == "-L") { + std::string Current = "-L"; + Current += Arg->getValue(); + LibraryPaths.push_back(Args.MakeArgString(Current.c_str())); + } + } + addDirectoryList(Args, LibraryPaths, "-L", "LIBRARY_PATH"); + + LibraryPaths.push_back( + Args.MakeArgString("-L" + C.getDriver().Dir + "/../lib/libdevice")); + addBCLib(C, Args, CmdArgs, LibraryPaths, + Args.MakeArgString("libicuda2gcn-" + std::string(GFXNAME) + ".bc")); + + addBCLib(C, Args, CmdArgs, LibraryPaths, "cuda2gcn.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "opencl.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "ockl.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "irif.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "ocml.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_finite_only_off.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_daz_opt_off.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, + "oclc_correctly_rounded_sqrt_on.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_unsafe_math_off.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "hc.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_isa_version.amdgcn.bc"); + + addEnvListWithSpaces(Args, CmdArgs, "CLANG_TARGET_LINK_OPTS"); + CmdArgs.push_back("-suppress-warnings"); + + // Add an intermediate output file which is input to opt + CmdArgs.push_back("-o"); + std::string TmpName = C.getDriver().GetTemporaryPath("OPT_INPUT", "bc"); + const char *ResultingBitcodeF = + C.addTempFile(C.getArgs().MakeArgString(TmpName.c_str())); + CmdArgs.push_back(ResultingBitcodeF); + const char *Exec = Args.MakeArgString(C.getDriver().Dir + "/llvm-link"); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + + ArgStringList OptArgs; + // The input to opt is the output from llvm-link + OptArgs.push_back(ResultingBitcodeF); + // Add CLANG_TARGETOPT_OPTS override options to opt + if (getenv("CLANG_TARGET_OPT_OPTS")) + addEnvListWithSpaces(Args, OptArgs, "CLANG_TARGET_OPT_OPTS"); + else { + if (Arg *A = Args.getLastArg(options::OPT_O_Group)) { + StringRef OOpt = "3"; + if (A->getOption().matches(options::OPT_O4) || + A->getOption().matches(options::OPT_Ofast)) + OOpt = "3"; + else if (A->getOption().matches(options::OPT_O0)) + OOpt = "0"; + else if (A->getOption().matches(options::OPT_O)) { + // -Os, -Oz, and -O(anything else) map to -O2, for lack of better options. + OOpt = llvm::StringSwitch(A->getValue()) + .Case("1", "1") + .Case("2", "2") + .Case("3", "3") + .Case("s", "2") + .Case("z", "2") + .Default("2"); + } + OptArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt)); + } + OptArgs.push_back("-S"); + const char *mcpustr = Args.MakeArgString("-mcpu=" + GFXNAME); + OptArgs.push_back(mcpustr); + OptArgs.push_back("-dce"); + OptArgs.push_back("-sroa"); + OptArgs.push_back("-globaldce"); + } + OptArgs.push_back("-o"); + OptArgs.push_back(Output.getFilename()); + const char *OptExec = Args.MakeArgString(C.getDriver().Dir + "/opt"); + C.addCommand(llvm::make_unique(JA, *this, OptExec, OptArgs, Inputs)); + +} + void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, @@ -279,7 +469,8 @@ const char *LinkingOutput) const { const auto &TC = static_cast(getToolChain()); - assert(TC.getTriple().isNVPTX() && "Wrong platform"); + assert((TC.getTriple().isNVPTX() || + TC.getTriple().getArch()==llvm::Triple::amdgcn) && "Wrong platform"); StringRef GPUArchName; // If this is an OpenMP action we need to extract the device architecture @@ -296,6 +487,41 @@ assert(gpu_arch != CudaArch::UNKNOWN && "Device action expected to have an architecture."); + // For amdgcn this job will call llc (Lightning Compiler) + if (StringRef(JA.getOffloadingArch()).startswith("gfx")) { + ArgStringList CmdArgs; + for (InputInfoList::const_iterator + it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { + const InputInfo &II = *it; + CmdArgs.push_back(II.getFilename()); + } + CmdArgs.push_back("-mtriple=amdgcn--cuda"); + CmdArgs.push_back("-filetype=obj"); + addEnvListWithSpaces(Args, CmdArgs, "CLANG_TARGET_LLC_OPTS"); + std::string GFXNAME = JA.getOffloadingArch(); + CmdArgs.push_back(Args.MakeArgString("-mcpu=" + GFXNAME)); + CmdArgs.push_back("-o"); + std::string TmpName = C.getDriver().GetTemporaryPath("LC_OUTPUT", "o"); + const char *llcOutputFile = C.addTempFile( + C.getArgs().MakeArgString(TmpName.c_str())); + CmdArgs.push_back(llcOutputFile); + const char *Exec = Args.MakeArgString(C.getDriver().Dir + "/llc"); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + + ArgStringList CmdArgs2; + CmdArgs2.push_back("-flavor"); + CmdArgs2.push_back("gnu"); + CmdArgs2.push_back("--no-undefined"); + CmdArgs2.push_back("-shared"); + // The output from ld.lld is an HSA code object file + CmdArgs2.push_back("-o"); + CmdArgs2.push_back(Output.getFilename()); + CmdArgs2.push_back(llcOutputFile); + const char *lld = Args.MakeArgString(C.getDriver().Dir + "/lld"); + C.addCommand(llvm::make_unique(JA, *this, lld, CmdArgs2, Inputs)); + return; + } + // Check that our installation's ptxas supports gpu_arch. if (!Args.hasArg(options::OPT_no_cuda_version_check)) { TC.CudaInstallation.CheckCudaVersionSupportsArch(gpu_arch); @@ -386,7 +612,123 @@ const char *LinkingOutput) const { const auto &TC = static_cast(getToolChain()); - assert(TC.getTriple().isNVPTX() && "Wrong platform"); + assert((TC.getTriple().isNVPTX() || + TC.getTriple().getArch()==llvm::Triple::amdgcn) && "Wrong platform"); + + // This job builds composite cubin file from each output of the assemble step + // There are 3 scenarios and the command(s) needed for each. + // 1 mixed targets - run "fatinary" and clang tool "clang-fixup-fatbin" + // 2 only amdgpu targets - Run clang tool "clang-assemble-fatbin" + // 3 only nvptx targets - Run cuda SDK "fatbinary" program + bool found_amdgcn=false; + bool found_nvptx=false; + for (const auto& II : Inputs) { + if(StringRef(II.getAction()->getOffloadingArch()).startswith("gfx")) + found_amdgcn=true; + else + found_nvptx=true; + } + + // 1 Mixed targets, need fatbinary and clang-fixup-fatbin commands ----------- + if (found_amdgcn && found_nvptx) { + + const char *fbOutputFile ; + ArgStringList CmdArgs; + CmdArgs.push_back("--cuda"); + CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32"); + CmdArgs.push_back(Args.MakeArgString("--create")); + std::string TmpName = C.getDriver().GetTemporaryPath("FB_FIXUP", "fatbin"); + fbOutputFile = C.addTempFile(C.getArgs().MakeArgString(TmpName.c_str())); + CmdArgs.push_back(fbOutputFile); + + for (const auto& II : Inputs) { + auto *A = II.getAction(); + if(StringRef(A->getOffloadingArch()).startswith("gfx")) { + if (II.getType() != types::TY_PP_Asm) { + CmdArgs.push_back(Args.MakeArgString("--no-asm")); + // LIE to avoid unknown profile in fatbinary + CmdArgs.push_back(Args.MakeArgString( + llvm::Twine("--image=profile=sm_37,file=") + + II.getFilename())); + } + } else { + assert(A->getInputs().size() == 1 && + "Device offload action is expected to have a single input"); + const char *gpu_arch_str = A->getOffloadingArch(); + assert(gpu_arch_str && + "Device action expected to have associated a GPU architecture!"); + CudaArch gpu_arch = StringToCudaArch(gpu_arch_str); + + // We need to pass an Arch of the form "sm_XX" for cubin files and + // "compute_XX" for ptx. + const char *Arch = + (II.getType() == types::TY_PP_Asm) + ? CudaVirtualArchToString(VirtualArchForCudaArch(gpu_arch)) + : gpu_arch_str; + CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") + + Arch + ",file=" + II.getFilename())); + } + } + + for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary)) + CmdArgs.push_back(Args.MakeArgString(A)); + + const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary")); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + + // Call clang-fixup-fatbin to correct the sm_37 labels to real gfx labels + std::string subarchs = "-offload-archs="; + bool first = true; + for (const auto& II : Inputs) + if (II.getType() != types::TY_PP_Asm) { + if (first) { + subarchs = subarchs + + StringRef(II.getAction()->getOffloadingArch()).str(); + first = false ; + } else { + subarchs = subarchs + "," + + StringRef(II.getAction()->getOffloadingArch()).str(); + } + } + ArgStringList CmdArgs2; + CmdArgs2.push_back(Args.MakeArgString(subarchs)); + CmdArgs2.push_back(fbOutputFile); + CmdArgs2.push_back(Args.MakeArgString(Output.getFilename())); + const char *Exec2 = Args.MakeArgString(C.getDriver().Dir + + "/clang-fixup-fatbin"); + C.addCommand(llvm::make_unique(JA, *this, Exec2, CmdArgs2, + Inputs)); + return; + } + + // 2 Only amdgcn targets, call clang-assemble-fatbin ------------------------ + if (found_amdgcn) { + + ArgStringList CmdArgs; + std::string subarchs = "-offload-archs="; + bool first = true; + for (const auto& II : Inputs) { + if (II.getType() != types::TY_PP_Asm) { + if (first) { + subarchs = subarchs + + StringRef(II.getAction()->getOffloadingArch()).str(); + first = false ; + } else { + subarchs = subarchs + "," + + StringRef(II.getAction()->getOffloadingArch()).str(); + } + CmdArgs.push_back(II.getFilename()); + } + } + CmdArgs.push_back(Args.MakeArgString(subarchs)); + CmdArgs.push_back("-o"); + CmdArgs.push_back(Args.MakeArgString(Output.getFilename())); + const char *Exec = Args.MakeArgString(C.getDriver().Dir + + "/clang-assemble-fatbin"); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + return; + + } + // 3 Only nvptx targets found, so just call fatbinary ------------------------ ArgStringList CmdArgs; CmdArgs.push_back("--cuda"); @@ -566,6 +908,10 @@ return; } + // Do not add -link-cuda-bitcode or ptx42 features if gfx + if (GpuArch.startswith("gfx")) + return; + CC1Args.push_back("-mlink-cuda-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile)); @@ -670,6 +1016,10 @@ return DAL; } +Tool *CudaToolChain::buildBackend() const { + return new tools::NVPTX::Backend(*this); +} + Tool *CudaToolChain::buildAssembler() const { return new tools::NVPTX::Assembler(*this); } Index: test/Driver/cuda-phases.cu =================================================================== --- test/Driver/cuda-phases.cu +++ test/Driver/cuda-phases.cu @@ -7,22 +7,25 @@ // REQUIRES: clang-driver // REQUIRES: powerpc-registered-target // REQUIRES: nvptx-registered-target - +// REQUIRES: amdgpu-registered-target // // Test single gpu architecture with complete compilation. // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \ -// RUN: | FileCheck -check-prefix=BIN %s +// RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s // BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) // BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30) -// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30) -// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30) -// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30) -// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object -// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler +// BIN_NV-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH:sm_30]]) +// BIN_AMD-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH:gfx803]]) +// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, [[ARCH]]) +// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, [[ARCH]]) +// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, [[ARCH]]) +// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, [[ARCH]]) +// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P7]]}, object +// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P6]]}, assembler // BIN-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-cuda) // BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P10]]}, ir // BIN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-cuda) @@ -34,11 +37,13 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \ // RUN: | FileCheck -check-prefix=ASM %s -// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefix=ASM %s +// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH:sm_30|gfx803]]) +// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, [[ARCH]]) +// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P3]]}, assembler // ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (host-cuda) // ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-cuda) @@ -49,23 +54,25 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ // RUN: | FileCheck -check-prefix=BIN2 %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \ +// RUN: | FileCheck -check-prefix=BIN2 %s // BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) // BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30) -// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30) -// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30) -// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30) -// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object -// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler -// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, sm_35) -// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, sm_35) -// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, sm_35) -// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, sm_35) -// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P14]]}, object -// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P13]]}, assembler +// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH1:sm_30|gfx803]]) +// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH1]])" {[[P7]]}, object +// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH1]])" {[[P6]]}, assembler +// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH2:sm_35|gfx900]]) +// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH2]])" {[[P14]]}, object +// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH2]])" {[[P13]]}, assembler // BIN2-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-cuda) // BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P17]]}, ir // BIN2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-cuda) @@ -77,16 +84,18 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \ // RUN: | FileCheck -check-prefix=ASM2 %s -// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35) -// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35) -// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35) -// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefix=ASM2 %s +// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH1:sm_30|gfx803]]) +// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, [[ARCH1]]) +// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH1]]) +// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH1]]) +// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH1]])" {[[P3]]}, assembler +// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH2:sm_35|gfx900]]) +// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, [[ARCH2]]) +// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, [[ARCH2]]) +// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, [[ARCH2]]) +// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH2]])" {[[P8]]}, assembler // ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (host-cuda) // ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-cuda) @@ -98,6 +107,8 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \ // RUN: | FileCheck -check-prefix=HBIN %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefix=HBIN %s // HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) // HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) @@ -110,6 +121,8 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=HASM %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-host-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=HASM %s // HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) // HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) @@ -121,6 +134,8 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \ // RUN: | FileCheck -check-prefix=HBIN2 %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefix=HBIN2 %s // HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) // HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) @@ -134,6 +149,8 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=HASM2 %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=HASM2 %s // HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) // HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) @@ -145,12 +162,14 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \ // RUN: | FileCheck -check-prefix=DBIN %s -// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30) -// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-device-only 2>&1 \ +// RUN: | FileCheck -check-prefix=DBIN %s +// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH:sm_30|gfx803]]) +// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P4]]}, object // // Test single gpu architecture up to the assemble phase in device-only @@ -158,11 +177,13 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=DASM %s -// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-device-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=DASM %s +// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH:sm_30|gfx803]]) +// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, [[ARCH]]) +// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P3]]}, assembler // // Test two gpu architectures with complete compilation in device-only @@ -170,18 +191,20 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \ // RUN: | FileCheck -check-prefix=DBIN2 %s -// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30) -// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object -// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, sm_35) -// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, sm_35) -// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, sm_35) -// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, sm_35) -// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P10]]}, object +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only 2>&1 \ +// RUN: | FileCheck -check-prefix=DBIN2 %s +// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH:sm_30|gfx803]]) +// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, [[ARCH]]) +// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, [[ARCH]]) +// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P4]]}, object +// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH2:sm_35|gfx900]]) +// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH2]])" {[[P10]]}, object // // Test two gpu architectures up to the assemble phase in device-only @@ -189,13 +212,15 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=DASM2 %s -// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35) -// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35) -// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35) -// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=DASM2 %s +// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH:sm_30|gfx803]]) +// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, [[ARCH]]) +// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P3]]}, assembler +// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH2:sm_35|gfx900]]) +// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, [[ARCH2]]) +// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, [[ARCH2]]) +// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, [[ARCH2]]) +// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH2]])" {[[P8]]}, assembler