Index: include/clang/Basic/Cuda.h =================================================================== --- include/clang/Basic/Cuda.h +++ include/clang/Basic/Cuda.h @@ -46,6 +46,15 @@ SM_62, SM_70, SM_72, + GFX700, + GFX701, + GFX800, + GFX801, + GFX802, + GFX803, + GFX810, + GFX900, + GFX901, }; const char *CudaArchToString(CudaArch A); @@ -67,6 +76,7 @@ COMPUTE_62, COMPUTE_70, COMPUTE_72, + COMPUTE_GCN, }; const char *CudaVirtualArchToString(CudaVirtualArch A); Index: include/clang/Driver/ToolChain.h =================================================================== --- include/clang/Driver/ToolChain.h +++ include/clang/Driver/ToolChain.h @@ -108,10 +108,12 @@ path_list ProgramPaths; mutable std::unique_ptr Clang; + mutable std::unique_ptr Backend; mutable std::unique_ptr Assemble; mutable std::unique_ptr Link; mutable std::unique_ptr OffloadBundler; Tool *getClang() const; + Tool *getBackend() const; Tool *getAssemble() const; Tool *getLink() const; Tool *getClangAs() const; @@ -139,6 +141,7 @@ void setTripleEnvironment(llvm::Triple::EnvironmentType Env); virtual Tool *buildAssembler() const; + virtual Tool *buildBackend() const; virtual Tool *buildLinker() const; virtual Tool *getTool(Action::ActionClass AC) const; Index: lib/Basic/Cuda.cpp =================================================================== --- lib/Basic/Cuda.cpp +++ lib/Basic/Cuda.cpp @@ -56,6 +56,24 @@ return "sm_70"; case CudaArch::SM_72: return "sm_72"; + case CudaArch::GFX700: // kaveri + return "gfx700"; + case CudaArch::GFX701: // hawaii + return "gfx701"; + case CudaArch::GFX800: // iceland + return "gfx800"; + case CudaArch::GFX801: // carrizo + return "gfx801"; + case CudaArch::GFX802: // + return "gfx802"; + case CudaArch::GFX803: // fiji,polaris10 + return "gfx803"; + case CudaArch::GFX810: // + return "gfx810"; + case CudaArch::GFX900: // Vega + return "gfx900"; + case CudaArch::GFX901: // + return "gfx901"; } llvm_unreachable("invalid enum"); } @@ -76,6 +94,15 @@ .Case("sm_62", CudaArch::SM_62) .Case("sm_70", CudaArch::SM_70) .Case("sm_72", CudaArch::SM_72) + .Case("gfx700", CudaArch::GFX700) + .Case("gfx701", CudaArch::GFX701) + .Case("gfx800", CudaArch::GFX800) + .Case("gfx801", CudaArch::GFX801) + .Case("gfx802", CudaArch::GFX802) + .Case("gfx803", CudaArch::GFX803) + .Case("gfx810", CudaArch::GFX810) + .Case("gfx900", CudaArch::GFX900) + .Case("gfx901", CudaArch::GFX901) .Default(CudaArch::UNKNOWN); } @@ -109,6 +136,8 @@ return "compute_70"; case CudaVirtualArch::COMPUTE_72: return "compute_72"; + case CudaVirtualArch::COMPUTE_GCN: + return "compute_gcn"; } llvm_unreachable("invalid enum"); } @@ -128,6 +157,7 @@ .Case("compute_62", CudaVirtualArch::COMPUTE_62) .Case("compute_70", CudaVirtualArch::COMPUTE_70) .Case("compute_72", CudaVirtualArch::COMPUTE_72) + .Case("compute_gcn", CudaVirtualArch::COMPUTE_GCN) .Default(CudaVirtualArch::UNKNOWN); } @@ -162,6 +192,16 @@ return CudaVirtualArch::COMPUTE_70; case CudaArch::SM_72: return CudaVirtualArch::COMPUTE_72; + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX800: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: + case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX901: + return CudaVirtualArch::COMPUTE_GCN; } llvm_unreachable("invalid enum"); } @@ -188,6 +228,16 @@ return CudaVersion::CUDA_90; case CudaArch::SM_72: return CudaVersion::CUDA_91; + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX800: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: 
+ case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX901: + return CudaVersion::CUDA_70; } llvm_unreachable("invalid enum"); } @@ -198,6 +248,15 @@ return CudaVersion::UNKNOWN; case CudaArch::SM_20: case CudaArch::SM_21: + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX800: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: + case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX901: return CudaVersion::CUDA_80; default: return CudaVersion::LATEST; Index: lib/Basic/Targets/AMDGPU.h =================================================================== --- lib/Basic/Targets/AMDGPU.h +++ lib/Basic/Targets/AMDGPU.h @@ -14,6 +14,7 @@ #ifndef LLVM_CLANG_LIB_BASIC_TARGETS_AMDGPU_H #define LLVM_CLANG_LIB_BASIC_TARGETS_AMDGPU_H +#include "clang/Basic/Cuda.h" #include "clang/Basic/TargetInfo.h" #include "clang/Basic/TargetOptions.h" #include "llvm/ADT/StringSet.h" @@ -77,8 +78,11 @@ return TT.getArch() == llvm::Triple::amdgcn; } + CudaArch GCN_Subarch; + static bool isGenericZero(const llvm::Triple &TT) { return TT.getEnvironmentName() == "amdgiz" || + TT.getOS() == llvm::Triple::CUDA || TT.getEnvironmentName() == "amdgizcl"; } @@ -227,7 +231,7 @@ GPU = parseAMDGCNName(Name); else GPU = parseR600Name(Name); - + GCN_Subarch = StringToCudaArch(Name); return GPU != GK_NONE; } @@ -312,6 +316,8 @@ // address space has value 0 but in private and local address space has // value ~0. uint64_t getNullPointerValue(LangAS AS) const override { + if (getTriple().getOS() == llvm::Triple::CUDA) + return 0; return AS == LangAS::opencl_local ? ~0 : 0; } }; Index: lib/Basic/Targets/AMDGPU.cpp =================================================================== --- lib/Basic/Targets/AMDGPU.cpp +++ lib/Basic/Targets/AMDGPU.cpp @@ -328,6 +328,7 @@ : DataLayoutStringSIPrivateIsZero) : DataLayoutStringR600); assert(DataLayout->getAllocaAddrSpace() == AS.Private); + GCN_Subarch = CudaArch::GFX803; // Default to fiji setAddressSpaceMap(Triple.getOS() == llvm::Triple::Mesa3D || Triple.getEnvironment() == llvm::Triple::OpenCL || @@ -359,9 +360,67 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const { - if (getTriple().getArch() == llvm::Triple::amdgcn) - Builder.defineMacro("__AMDGCN__"); - else + if (getTriple().getArch() == llvm::Triple::amdgcn) { + if (getTriple().getOS() == llvm::Triple::CUDA) { + std::string GFXProcCode = [this] { + switch (GCN_Subarch) { + case CudaArch::UNKNOWN: + assert(false && "No GPU arch when compiling CUDA device code."); + return ""; + case CudaArch::SM_20: + return "000"; + case CudaArch::SM_21: + return "000"; + case CudaArch::SM_30: + return "000"; + case CudaArch::SM_32: + return "000"; + case CudaArch::SM_35: + return "000"; + case CudaArch::SM_37: + return "000"; + case CudaArch::SM_50: + return "000"; + case CudaArch::SM_52: + return "000"; + case CudaArch::SM_53: + return "000"; + case CudaArch::SM_60: + return "000"; + case CudaArch::SM_61: + return "000"; + case CudaArch::SM_62: + return "000"; + case CudaArch::SM_70: + return "000"; + case CudaArch::SM_72: + return "000"; + case CudaArch::GFX700: + return "700"; + case CudaArch::GFX701: + return "701"; + case CudaArch::GFX800: + return "800"; + case CudaArch::GFX801: + return "801"; + case CudaArch::GFX802: + return "802"; + case CudaArch::GFX803: + return "803"; + case CudaArch::GFX810: + return "810"; + case CudaArch::GFX900: + return "900"; + case CudaArch::GFX901: + return "901"; + } + llvm_unreachable("unhandled 
GFX processor"); + }(); + Builder.defineMacro("__AMDGCN__", GFXProcCode); + } else { + Builder.defineMacro("__AMDGCN__"); + } + } else Builder.defineMacro("__R600__"); if (hasFMAF) @@ -370,4 +429,63 @@ Builder.defineMacro("__HAS_LDEXPF__"); if (hasFP64) Builder.defineMacro("__HAS_FP64__"); + if (getTriple().getOS() == llvm::Triple::CUDA) { + // Set __CUDA_ARCH__ for the GPU specified. + std::string CUDAArchCode = [this] { + switch (GCN_Subarch) { + case CudaArch::UNKNOWN: + assert(false && "No GPU arch when compiling CUDA device code."); + return ""; + case CudaArch::SM_20: + return "200"; + case CudaArch::SM_21: + return "210"; + case CudaArch::SM_30: + return "300"; + case CudaArch::SM_32: + return "320"; + case CudaArch::SM_35: + return "350"; + case CudaArch::SM_37: + return "370"; + case CudaArch::SM_50: + return "500"; + case CudaArch::SM_52: + return "520"; + case CudaArch::SM_53: + return "530"; + case CudaArch::SM_60: + return "600"; + case CudaArch::SM_61: + return "610"; + case CudaArch::SM_62: + return "620"; + case CudaArch::SM_70: + return "700"; + case CudaArch::SM_72: + return "720"; + case CudaArch::GFX700: + return "320"; + case CudaArch::GFX701: + return "320"; + case CudaArch::GFX800: + return "320"; + case CudaArch::GFX801: + return "320"; + case CudaArch::GFX802: + return "320"; + case CudaArch::GFX803: + return "320"; + case CudaArch::GFX810: + return "320"; + case CudaArch::GFX900: + return "320"; + case CudaArch::GFX901: + return "320"; + } + llvm_unreachable("unhandled CudaArch"); + }(); + // Set __CUDA_ARCH__ for the GPU specified. + Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode); + } } Index: lib/Basic/Targets/NVPTX.cpp =================================================================== --- lib/Basic/Targets/NVPTX.cpp +++ lib/Basic/Targets/NVPTX.cpp @@ -188,6 +188,24 @@ return "700"; case CudaArch::SM_72: return "720"; + case CudaArch::GFX700: + return "320"; + case CudaArch::GFX701: + return "320"; + case CudaArch::GFX800: + return "320"; + case CudaArch::GFX801: + return "320"; + case CudaArch::GFX802: + return "320"; + case CudaArch::GFX803: + return "320"; + case CudaArch::GFX810: + return "320"; + case CudaArch::GFX900: + return "320"; + case CudaArch::GFX901: + return "320"; } llvm_unreachable("unhandled CudaArch"); }(); Index: lib/Driver/Driver.cpp =================================================================== --- lib/Driver/Driver.cpp +++ lib/Driver/Driver.cpp @@ -2303,7 +2303,8 @@ const ToolChain *HostTC = C.getSingleOffloadToolChain(); assert(HostTC && "No toolchain for host compilation."); - if (HostTC->getTriple().isNVPTX()) { + if (HostTC->getTriple().isNVPTX() || + HostTC->getTriple().getArch() == llvm::Triple::amdgcn) { // We do not support targeting NVPTX for host compilation. Throw // an error and abort pipeline construction early so we don't trip // asserts that assume device-side compilation. @@ -3200,6 +3201,9 @@ bool SaveTemps; bool EmbedBitcode; + /// Type of the input file for the tool + types::ID InputType; + /// Get previous dependent action or null if that does not exist. If /// \a CanBeCollapsed is false, that action must be legal to collapse or /// null will be returned. 
@@ -3257,6 +3261,8 @@ bool canCollapsePreprocessorAction() const { return !C.getArgs().hasArg(options::OPT_no_integrated_cpp) && !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps && + (InputType != types::TY_LLVM_IR) && + (InputType != types::TY_LLVM_BC) && !C.getArgs().hasArg(options::OPT_rewrite_objc); } @@ -3302,6 +3308,14 @@ if (!AJ || !BJ || !CJ) return nullptr; + // The compile and backend jobs cannot be combined for the amdgcn backend. + if ((AJ->isOffloading(Action::OFK_Cuda) || + AJ->isOffloading(Action::OFK_OpenMP)) && + (StringRef(AJ->getOffloadingArch()).startswith("gfx") || + TC.getTriple().getArch() == llvm::Triple::amdgcn)) { + return nullptr; + } + // Get compiler tool. const Tool *T = TC.SelectTool(*CJ); if (!T) @@ -3333,6 +3347,14 @@ if (!AJ || !BJ) return nullptr; + // The assemble and backend jobs cannot be combined for the amdgcn backend. + if ((AJ->isOffloading(Action::OFK_Cuda) || + AJ->isOffloading(Action::OFK_OpenMP)) && + (StringRef(AJ->getOffloadingArch()).startswith("gfx") || + TC.getTriple().getArch() == llvm::Triple::amdgcn)) { + return nullptr; + } + // Retrieve the compile job, backend action must always be preceded by one. ActionList CompileJobOffloadActions; auto *CJ = getPrevDependentAction(BJ->getInputs(), CompileJobOffloadActions, @@ -3366,6 +3388,20 @@ if (!BJ || !CJ) return nullptr; + // The compile and backend jobs cannot be combined for the amdgcn backend. + if ((BJ->isOffloading(Action::OFK_Cuda) || + BJ->isOffloading(Action::OFK_OpenMP)) && + (StringRef(BJ->getOffloadingArch()).startswith("gfx") || + TC.getTriple().getArch() == llvm::Triple::amdgcn)) { + // Combining is still required when only IR is requested with the flags + // "-c -S -emit-llvm". With just "-c -S", the gcn backend is needed to + // produce linked and optimized IR for llc, so do not combine. + if (!(C.getArgs().hasArg(options::OPT_c) && + C.getArgs().hasArg(options::OPT_S) && + C.getArgs().hasArg(options::OPT_emit_llvm))) + return nullptr; + } + // Get compiler tool. const Tool *T = TC.SelectTool(*CJ); if (!T) @@ -3409,6 +3445,14 @@ EmbedBitcode(EmbedBitcode) { assert(BaseAction && "Invalid base action."); IsHostSelector = BaseAction->getOffloadingDeviceKind() == Action::OFK_None; + // Record the input type to check whether compile and backend jobs can collapse. + for (Arg *A : C.getInputArgs()) { + if (A->getOption().getKind() == Option::InputClass) { + const char *Value = A->getValue(); + if (const char *Ext = strrchr(Value, '.')) + InputType = TC.LookupTypeForExtension(Ext + 1); + } + } } /// Check if a chain of actions can be combined and return the tool that can @@ -3909,7 +3953,10 @@ JA.getType() == types::TY_LLVM_BC) Suffixed += ".tmp"; Suffixed += '.'; - Suffixed += Suffix; + if (((StringRef)BaseInput).endswith(".a")) + Suffixed += "a"; + else + Suffixed += Suffix; NamedOutput = C.getArgs().MakeArgString(Suffixed.c_str()); } Index: lib/Driver/SanitizerArgs.cpp =================================================================== --- lib/Driver/SanitizerArgs.cpp +++ lib/Driver/SanitizerArgs.cpp @@ -720,7 +720,8 @@ // NVPTX doesn't currently support sanitizers. Bailing out here means that // e.g. -fsanitize=address applies only to host code, which is what we want // for now. - if (TC.getTriple().isNVPTX()) + if (TC.getTriple().isNVPTX() || + TC.getTriple().getArch() == llvm::Triple::amdgcn) return; // Translate available CoverageFeatures to corresponding clang-cc1 flags.
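A note on the macro scheme used above: for a gfx subarch compiled with the CUDA OS triple, the AMDGPU.cpp hunk defines __AMDGCN__ with the three-digit gfx processor code (for example 803 for gfx803) and pins __CUDA_ARCH__ to 320, and the NVPTX.cpp hunk applies the same 320 mapping. A rough, illustrative sketch (not part of the patch) of how CUDA source could tell the passes apart under this scheme:

#if defined(__AMDGCN__) && defined(__CUDA_ARCH__)
  // AMD GCN device pass: __AMDGCN__ carries the gfx code (700..901),
  // while __CUDA_ARCH__ only reports the pinned value 320.
#elif defined(__CUDA_ARCH__)
  // NVPTX device pass: __CUDA_ARCH__ carries the real SM version.
#else
  // Host pass: neither macro is defined.
#endif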
Index: lib/Driver/ToolChain.cpp =================================================================== --- lib/Driver/ToolChain.cpp +++ lib/Driver/ToolChain.cpp @@ -241,6 +241,8 @@ return Clang.get(); } +Tool *ToolChain::buildBackend() const { return new tools::Clang(*this); } + Tool *ToolChain::buildAssembler() const { return new tools::ClangAs(*this); } @@ -255,6 +257,12 @@ return Assemble.get(); } +Tool *ToolChain::getBackend() const { + if (!Backend) + Backend.reset(buildBackend()); + return Backend.get(); +} + Tool *ToolChain::getClangAs() const { if (!Assemble) Assemble.reset(new tools::ClangAs(*this)); @@ -295,8 +303,9 @@ case Action::AnalyzeJobClass: case Action::MigrateJobClass: case Action::VerifyPCHJobClass: - case Action::BackendJobClass: return getClang(); + case Action::BackendJobClass: + return getBackend(); case Action::OffloadBundlingJobClass: case Action::OffloadUnbundlingJobClass: @@ -381,8 +390,21 @@ } Tool *ToolChain::SelectTool(const JobAction &JA) const { - if (getDriver().ShouldUseClangCompiler(JA)) return getClang(); Action::ActionClass AC = JA.getKind(); + // The amdgcn backend action needs the separate tool from buildBackend(). + if ((JA.isOffloading(Action::OFK_Cuda) || + JA.isOffloading(Action::OFK_OpenMP)) && + (StringRef(JA.getOffloadingArch()).startswith("gfx") || + (getTriple().getArch() == llvm::Triple::amdgcn)) && + (AC == Action::BackendJobClass)) { + if ((Args.hasArg(options::OPT_emit_llvm)) || + (Args.hasArg(options::OPT_emit_llvm_bc))) + return getClang(); // Don't run the amdgcn backend if we just want LLVM IR + else + return getTool(AC); + } + if (getDriver().ShouldUseClangCompiler(JA)) + return getClang(); if (AC == Action::AssembleJobClass && useIntegratedAs()) return getClangAs(); return getTool(AC); Index: lib/Driver/ToolChains/Clang.cpp =================================================================== --- lib/Driver/ToolChains/Clang.cpp +++ lib/Driver/ToolChains/Clang.cpp @@ -2322,9 +2322,10 @@ ArgStringList &CmdArgs, bool KernelOrKext) { const llvm::Triple &EffectiveTriple = TC.getEffectiveTriple(); - // NVPTX doesn't support stack protectors; from the compiler's perspective, it - // doesn't even have a stack! - if (EffectiveTriple.isNVPTX()) + // NVPTX and GCN don't support stack protectors; from the compiler's + // perspective, they don't even have a stack! + if (EffectiveTriple.isNVPTX() || + EffectiveTriple.getArch() == llvm::Triple::amdgcn) return; // -stack-protector=0 is default. @@ -3061,7 +3062,16 @@ const ArgList &Args, const char *LinkingOutput) const { const llvm::Triple &RawTriple = getToolChain().getTriple(); const llvm::Triple &Triple = getToolChain().getEffectiveTriple(); - const std::string &TripleStr = Triple.getTriple(); + + bool Is_amdgcn = StringRef(JA.getOffloadingArch()).startswith("gfx") || + (getToolChain().getArch() == llvm::Triple::amdgcn); + // Currently the CUDA driver only supports the offload triple nvptx64-nvidia-cuda. + // Switch it from nvptx to amdgcn iff the subarch is a gfx processor. + const std::string &TripleStr = + Is_amdgcn && (JA.isOffloading(Action::OFK_Cuda) || + JA.isOffloading(Action::OFK_OpenMP)) + ? "amdgcn--cuda" + : Triple.getTriple(); bool KernelOrKext = Args.hasArg(options::OPT_mkernel, options::OPT_fapple_kext); @@ -3462,7 +3472,8 @@ // Enable -mconstructor-aliases except on darwin, where we have to work around // a linker bug (see ), and CUDA device code, where // aliases aren't supported.
- if (!RawTriple.isOSDarwin() && !RawTriple.isNVPTX()) + if (!RawTriple.isOSDarwin() && !RawTriple.isNVPTX() && + RawTriple.getArch() != llvm::Triple::amdgcn) CmdArgs.push_back("-mconstructor-aliases"); // Darwin's kernel doesn't support guard variables; just die if we Index: lib/Driver/ToolChains/Cuda.h =================================================================== --- lib/Driver/ToolChains/Cuda.h +++ lib/Driver/ToolChains/Cuda.h @@ -11,11 +11,12 @@ #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_CUDA_H #include "clang/Basic/Cuda.h" +#include "clang/Basic/DebugInfoOptions.h" #include "clang/Basic/VersionTuple.h" #include "clang/Driver/Action.h" #include "clang/Driver/Multilib.h" -#include "clang/Driver/ToolChain.h" #include "clang/Driver/Tool.h" +#include "clang/Driver/ToolChain.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Support/Compiler.h" @@ -81,6 +82,20 @@ namespace tools { namespace NVPTX { +// for amdgcn the backend is llvm-link + opt +class LLVM_LIBRARY_VISIBILITY Backend : public Tool { +public: + Backend(const ToolChain &TC) + : Tool("NVPTX::Backend", "GPU-backend", TC, RF_Full, llvm::sys::WEM_UTF8, + "--options-file") {} + virtual bool hasIntegratedCPP() const override { return false; } + virtual void ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const override; +}; + // Run ptxas, the NVPTX assembler. class LLVM_LIBRARY_VISIBILITY Assembler : public Tool { public: @@ -127,6 +142,15 @@ }; } // end namespace NVPTX + +void addBCLib(Compilation &C, const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs, + llvm::opt::ArgStringList LibraryPaths, const char *BCName); + +void addEnvListWithSpaces(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs, + const char *EnvVar); + } // end namespace tools namespace toolchains { @@ -184,6 +208,7 @@ CudaInstallationDetector CudaInstallation; protected: + Tool *buildBackend() const override; // for amdgcn, link and opt Tool *buildAssembler() const override; // ptxas Tool *buildLinker() const override; // fatbinary (ok, not really a linker) Index: lib/Driver/ToolChains/Cuda.cpp =================================================================== --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -186,6 +186,43 @@ IsValid = true; break; } + + // Search for GCN Device Libraries + std::string GCNPath; + for (Arg *A : Args) { + if (A->getOption().matches(options::OPT_cuda_gpu_arch_EQ) && + StringRef(A->getValue()).startswith("gfx")) { + SmallVector GCNPathCandidates; + if (const char *libamdgcn = getenv("LIBAMDGCN")) + GCNPathCandidates.push_back(D.SysRoot + libamdgcn); + else + GCNPathCandidates.push_back(D.SysRoot + "/opt/rocm/libamdgcn"); + for (const auto &CPath : GCNPathCandidates) { + if (CPath.empty() || !D.getVFS().exists(CPath)) + continue; + GCNPath = CPath; + } + break; + } + } + + // The directory names of GCN device libraries are the gfxnames. + // e.g. 
/opt/rocm/libamdgcn + if (!GCNPath.empty()) { + auto &FS = D.getVFS(); + std::error_code EC; + for (llvm::sys::fs::directory_iterator LI(GCNPath, EC), LE; !EC && LI != LE; + LI = LI.increment(EC)) { + StringRef Dirname = LI->path(); + StringRef GCNname = Dirname.rsplit('/').second; + if (GCNname.startswith("gfx")) { + std::string OCLFilePath = Dirname.str() + "/lib/opencl.amdgcn.bc"; + if (FS.exists(OCLFilePath)) + LibDeviceMap[GCNname.str()] = OCLFilePath; + } + continue; + } + } } void CudaInstallationDetector::AddCudaIncludeArgs( @@ -237,6 +274,147 @@ << CudaVersionToString(Version) << "\n"; } +void tools::addBCLib(Compilation &C, const ArgList &Args, + ArgStringList &CmdArgs, ArgStringList LibraryPaths, + const char *BCName) { + std::string FullName; + bool FoundLibDevice = false; + for (std::string LibraryPath : LibraryPaths) { + LibraryPath = LibraryPath.substr(2, LibraryPath.length() - 2); // Strip the leading "-L" + FullName = Args.MakeArgString(LibraryPath + "/" + BCName); + if (llvm::sys::fs::exists(FullName.c_str())) { + FoundLibDevice = true; + break; + } + } + if (!FoundLibDevice) + C.getDriver().Diag(diag::err_drv_no_such_file) << BCName; + CmdArgs.push_back(Args.MakeArgString(FullName)); +} + +void tools::addEnvListWithSpaces(const ArgList &Args, ArgStringList &CmdArgs, + const char *EnvVar) { + const char *DirList = ::getenv(EnvVar); + if (!DirList) + return; + StringRef Dirs(DirList); + if (Dirs.empty()) + return; + StringRef::size_type Delim; + while ((Delim = Dirs.find(" ")) != StringRef::npos) { + if (Delim != 0) + CmdArgs.push_back(Args.MakeArgString(Dirs.substr(0, Delim))); + Dirs = Dirs.substr(Delim + 1); // Trim front space + } + if (!Dirs.empty()) // The last arg may have no trailing space + CmdArgs.push_back(Args.MakeArgString(Dirs)); } + +void NVPTX::Backend::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + + assert(StringRef(JA.getOffloadingArch()).startswith("gfx") && + "The Backend tool should only be used for gfx offload architectures"); + + // For amdgcn, the backend job runs the llvm-link and opt steps + ArgStringList CmdArgs; + // Add the input bitcode files created by the compile step + for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); + it != ie; ++it) { + const InputInfo &II = *it; + CmdArgs.push_back(II.getFilename()); + } + + std::string GFXNAME = JA.getOffloadingArch(); + + ArgStringList LibraryPaths; + + // Always search for the bitcode libraries in libamdgcn first + const char *libamdgcn; + libamdgcn = getenv("LIBAMDGCN"); + if (!libamdgcn) + libamdgcn = "/opt/rocm/libamdgcn"; + LibraryPaths.push_back(Args.MakeArgString( + "-L" + std::string(libamdgcn) + "/" + std::string(GFXNAME) + "/lib")); + + // Also search paths given with -L and in LIBRARY_PATH.
+ for (auto Arg : Args) { + if (Arg->getSpelling() == "-L") { + std::string Current = "-L"; + Current += Arg->getValue(); + LibraryPaths.push_back(Args.MakeArgString(Current.c_str())); + } + } + addDirectoryList(Args, LibraryPaths, "-L", "LIBRARY_PATH"); + + LibraryPaths.push_back( + Args.MakeArgString("-L" + C.getDriver().Dir + "/../lib/libdevice")); + addBCLib(C, Args, CmdArgs, LibraryPaths, + Args.MakeArgString("libicuda2gcn-" + std::string(GFXNAME) + ".bc")); + + addBCLib(C, Args, CmdArgs, LibraryPaths, "cuda2gcn.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "opencl.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "ockl.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "irif.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "ocml.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_finite_only_off.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_daz_opt_off.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, + "oclc_correctly_rounded_sqrt_on.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_unsafe_math_off.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "hc.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_isa_version.amdgcn.bc"); + + addEnvListWithSpaces(Args, CmdArgs, "CLANG_TARGET_LINK_OPTS"); + CmdArgs.push_back("-suppress-warnings"); + + // Add an intermediate output file which is input to opt + CmdArgs.push_back("-o"); + std::string TmpName = C.getDriver().GetTemporaryPath("OPT_INPUT", "bc"); + const char *ResultingBitcodeF = + C.addTempFile(C.getArgs().MakeArgString(TmpName.c_str())); + CmdArgs.push_back(ResultingBitcodeF); + const char *Exec = Args.MakeArgString(C.getDriver().Dir + "/llvm-link"); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + + ArgStringList OptArgs; + // The input to opt is the output from llvm-link + OptArgs.push_back(ResultingBitcodeF); + // Add CLANG_TARGETOPT_OPTS override options to opt + if (getenv("CLANG_TARGET_OPT_OPTS")) + addEnvListWithSpaces(Args, OptArgs, "CLANG_TARGET_OPT_OPTS"); + else { + OptArgs.push_back(Args.MakeArgString("-O2")); + OptArgs.push_back("-S"); + const char *mcpustr = Args.MakeArgString("-mcpu=" + GFXNAME); + OptArgs.push_back(mcpustr); + OptArgs.push_back("-dce"); + OptArgs.push_back("-sroa"); + OptArgs.push_back("-globaldce"); + } + OptArgs.push_back("-o"); + OptArgs.push_back(Output.getFilename()); + const char *OptExec = Args.MakeArgString(C.getDriver().Dir + "/opt"); + C.addCommand(llvm::make_unique(JA, *this, OptExec, OptArgs, Inputs)); + + // If Verbose, save input for llc in /tmp and print all symbols + if (Args.hasArg(options::OPT_v)) { + ArgStringList nmArgs; + nmArgs.push_back(ResultingBitcodeF); + nmArgs.push_back("-debug-syms"); + const char *nmExec = Args.MakeArgString(C.getDriver().Dir + "/llvm-nm"); + C.addCommand(llvm::make_unique(JA, *this, nmExec, nmArgs, Inputs)); + ArgStringList cpArgs; + cpArgs.push_back(Output.getFilename()); + cpArgs.push_back("/tmp/llc_input.ll"); + C.addCommand(llvm::make_unique( + JA, *this, Args.MakeArgString("/bin/cp"), cpArgs, Inputs)); + } +} + void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, @@ -244,7 +422,9 @@ const char *LinkingOutput) const { const auto &TC = static_cast(getToolChain()); - assert(TC.getTriple().isNVPTX() && "Wrong platform"); + assert((TC.getTriple().isNVPTX() || + TC.getTriple().getArch() == llvm::Triple::amdgcn) && + "Wrong platform"); StringRef GPUArchName; // If this is an 
OpenMP action we need to extract the device architecture @@ -261,6 +441,41 @@ assert(gpu_arch != CudaArch::UNKNOWN && "Device action expected to have an architecture."); + // For amdgcn this job will call llc (Lightning Compiler) + if (StringRef(JA.getOffloadingArch()).startswith("gfx")) { + ArgStringList CmdArgs; + for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); + it != ie; ++it) { + const InputInfo &II = *it; + CmdArgs.push_back(II.getFilename()); + } + CmdArgs.push_back("-mtriple=amdgcn--cuda"); + CmdArgs.push_back("-filetype=obj"); + addEnvListWithSpaces(Args, CmdArgs, "CLANG_TARGET_LLC_OPTS"); + std::string GFXNAME = JA.getOffloadingArch(); + CmdArgs.push_back(Args.MakeArgString("-mcpu=" + GFXNAME)); + CmdArgs.push_back("-o"); + std::string TmpName = C.getDriver().GetTemporaryPath("LC_OUTPUT", "o"); + const char *llcOutputFile = + C.addTempFile(C.getArgs().MakeArgString(TmpName.c_str())); + CmdArgs.push_back(llcOutputFile); + const char *Exec = Args.MakeArgString(C.getDriver().Dir + "/llc"); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + + ArgStringList CmdArgs2; + CmdArgs2.push_back("-flavor"); + CmdArgs2.push_back("gnu"); + CmdArgs2.push_back("--no-undefined"); + CmdArgs2.push_back("-shared"); + // The output from ld.lld is an HSA code object file + CmdArgs2.push_back("-o"); + CmdArgs2.push_back(Output.getFilename()); + CmdArgs2.push_back(llcOutputFile); + const char *lld = Args.MakeArgString(C.getDriver().Dir + "/lld"); + C.addCommand(llvm::make_unique(JA, *this, lld, CmdArgs2, Inputs)); + return; + } + // Check that our installation's ptxas supports gpu_arch. if (!Args.hasArg(options::OPT_no_cuda_version_check)) { TC.CudaInstallation.CheckCudaVersionSupportsArch(gpu_arch); @@ -313,7 +528,10 @@ CmdArgs.push_back("--gpu-name"); CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch))); CmdArgs.push_back("--output-file"); - CmdArgs.push_back(Args.MakeArgString(TC.getInputFilename(Output))); + SmallString<256> OutputFileName(Output.getFilename()); + if (JA.isOffloading(Action::OFK_OpenMP)) + llvm::sys::path::replace_extension(OutputFileName, "cubin"); + CmdArgs.push_back(Args.MakeArgString(OutputFileName)); for (const auto& II : Inputs) CmdArgs.push_back(Args.MakeArgString(II.getFilename())); @@ -345,31 +563,53 @@ const char *LinkingOutput) const { const auto &TC = static_cast(getToolChain()); - assert(TC.getTriple().isNVPTX() && "Wrong platform"); + assert((TC.getTriple().isNVPTX() || + TC.getTriple().getArch() == llvm::Triple::amdgcn) && + "Wrong platform"); ArgStringList CmdArgs; CmdArgs.push_back("--cuda"); CmdArgs.push_back(TC.getTriple().isArch64Bit() ? 
"-64" : "-32"); CmdArgs.push_back(Args.MakeArgString("--create")); - CmdArgs.push_back(Args.MakeArgString(Output.getFilename())); + bool found_gfx = false; + for (const auto &II : Inputs) + if (StringRef(II.getAction()->getOffloadingArch()).startswith("gfx")) + found_gfx = true; + const char *fbOutputFile; + if (found_gfx) { + // If gfx, we need clang-fixup-fatbin, so intercept the fatbinary output + std::string TmpName = C.getDriver().GetTemporaryPath("FB_FIXUP", "fatbin"); + fbOutputFile = C.addTempFile(C.getArgs().MakeArgString(TmpName.c_str())); + CmdArgs.push_back(fbOutputFile); + } else + CmdArgs.push_back(Args.MakeArgString(Output.getFilename())); for (const auto& II : Inputs) { auto *A = II.getAction(); - assert(A->getInputs().size() == 1 && - "Device offload action is expected to have a single input"); - const char *gpu_arch_str = A->getOffloadingArch(); - assert(gpu_arch_str && - "Device action expected to have associated a GPU architecture!"); - CudaArch gpu_arch = StringToCudaArch(gpu_arch_str); - - // We need to pass an Arch of the form "sm_XX" for cubin files and - // "compute_XX" for ptx. - const char *Arch = - (II.getType() == types::TY_PP_Asm) - ? CudaVirtualArchToString(VirtualArchForCudaArch(gpu_arch)) - : gpu_arch_str; - CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") + - Arch + ",file=" + II.getFilename())); + if (StringRef(A->getOffloadingArch()).startswith("gfx")) { + if (II.getType() != types::TY_PP_Asm) { + CmdArgs.push_back(Args.MakeArgString("--no-asm")); + // LIE to avoid unknown profile in fatbinary + CmdArgs.push_back(Args.MakeArgString( + llvm::Twine("--image=profile=sm_37,file=") + +II.getFilename())); + } + } else { + assert(A->getInputs().size() == 1 && + "Device offload action is expected to have a single input"); + const char *gpu_arch_str = A->getOffloadingArch(); + assert(gpu_arch_str && + "Device action expected to have associated a GPU architecture!"); + CudaArch gpu_arch = StringToCudaArch(gpu_arch_str); + + // We need to pass an Arch of the form "sm_XX" for cubin files and + // "compute_XX" for ptx. + const char *Arch = + (II.getType() == types::TY_PP_Asm) + ? 
CudaVirtualArchToString(VirtualArchForCudaArch(gpu_arch)) + : gpu_arch_str; + CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") + + Arch + ",file=" + II.getFilename())); + } } for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary)) @@ -377,6 +617,30 @@ const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary")); C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + + if (found_gfx) { + std::string subarchs = "-offload-archs="; + bool first = true; + for (const auto &II : Inputs) + if (II.getType() != types::TY_PP_Asm) { + if (first) { + subarchs = + subarchs + StringRef(II.getAction()->getOffloadingArch()).str(); + first = false; + } else { + subarchs = subarchs + "," + + StringRef(II.getAction()->getOffloadingArch()).str(); + } + } + ArgStringList CmdArgs2; + CmdArgs2.push_back(Args.MakeArgString(subarchs)); + CmdArgs2.push_back(fbOutputFile); + CmdArgs2.push_back(Args.MakeArgString(Output.getFilename())); + const char *Exec2 = + Args.MakeArgString(C.getDriver().Dir + "/clang-fixup-fatbin"); + C.addCommand( + llvm::make_unique(JA, *this, Exec2, CmdArgs2, Inputs)); + } } void NVPTX::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA, @@ -521,6 +785,12 @@ return; } + // Do not add -link-cuda-bitcode or ptx42 features if gfx + for (Arg *A : DriverArgs) + if (A->getOption().matches(options::OPT_cuda_gpu_arch_EQ) && + StringRef(A->getValue()).startswith("gfx")) + return; + CC1Args.push_back("-mlink-cuda-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile)); @@ -625,6 +895,10 @@ return DAL; } +Tool *CudaToolChain::buildBackend() const { + return new tools::NVPTX::Backend(*this); +} + Tool *CudaToolChain::buildAssembler() const { return new tools::NVPTX::Assembler(*this); } Index: test/Driver/cuda-phases.cu =================================================================== --- test/Driver/cuda-phases.cu +++ test/Driver/cuda-phases.cu @@ -7,22 +7,25 @@ // REQUIRES: clang-driver // REQUIRES: powerpc-registered-target // REQUIRES: nvptx-registered-target - +// REQUIRES: amdgpu-registered-target // // Test single gpu architecture with complete compilation. 
// // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \ -// RUN: | FileCheck -check-prefix=BIN %s +// RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s // BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) // BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30) -// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30) -// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30) -// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30) -// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object -// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler +// BIN_NV-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH:sm_30]]) +// BIN_AMD-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH:gfx803]]) +// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, [[ARCH]]) +// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, [[ARCH]]) +// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, [[ARCH]]) +// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, [[ARCH]]) +// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P7]]}, object +// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P6]]}, assembler // BIN-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-cuda) // BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P10]]}, ir // BIN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-cuda) @@ -34,11 +37,13 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \ // RUN: | FileCheck -check-prefix=ASM %s -// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefix=ASM %s +// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH:sm_30|gfx803]]) +// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, [[ARCH]]) +// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P3]]}, assembler // ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (host-cuda) // ASM-DAG: 
[[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-cuda) @@ -49,23 +54,25 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ // RUN: | FileCheck -check-prefix=BIN2 %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \ +// RUN: | FileCheck -check-prefix=BIN2 %s // BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) // BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30) -// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30) -// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30) -// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30) -// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object -// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler -// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, sm_35) -// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, sm_35) -// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, sm_35) -// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, sm_35) -// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P14]]}, object -// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P13]]}, assembler +// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH1:sm_30|gfx803]]) +// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH1]])" {[[P7]]}, object +// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH1]])" {[[P6]]}, assembler +// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH2:sm_35|gfx900]]) +// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH2]])" {[[P14]]}, object +// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH2]])" {[[P13]]}, assembler // BIN2-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-cuda) // BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P17]]}, ir // BIN2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-cuda) @@ -77,16 +84,18 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu 
-ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \ // RUN: | FileCheck -check-prefix=ASM2 %s -// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35) -// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35) -// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35) -// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefix=ASM2 %s +// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH1:sm_30|gfx803]]) +// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, [[ARCH1]]) +// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH1]]) +// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH1]]) +// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH1]])" {[[P3]]}, assembler +// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH2:sm_35|gfx900]]) +// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, [[ARCH2]]) +// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, [[ARCH2]]) +// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, [[ARCH2]]) +// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH2]])" {[[P8]]}, assembler // ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (host-cuda) // ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-cuda) @@ -98,6 +107,8 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \ // RUN: | FileCheck -check-prefix=HBIN %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefix=HBIN %s // HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) // HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) @@ -110,6 +121,8 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=HASM %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-host-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=HASM %s // HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) // HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) @@ -121,6 +134,8 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 
--cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \ // RUN: | FileCheck -check-prefix=HBIN2 %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefix=HBIN2 %s // HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) // HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) @@ -134,6 +149,8 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=HASM2 %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=HASM2 %s // HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) // HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) // HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) @@ -145,12 +162,14 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \ // RUN: | FileCheck -check-prefix=DBIN %s -// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30) -// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-device-only 2>&1 \ +// RUN: | FileCheck -check-prefix=DBIN %s +// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH:sm_30|gfx803]]) +// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P4]]}, object // // Test single gpu architecture up to the assemble phase in device-only @@ -158,11 +177,13 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=DASM %s -// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-device-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=DASM %s +// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH:sm_30|gfx803]]) +// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, 
(device-cuda, [[ARCH]]) +// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P3]]}, assembler // // Test two gpu architectures with complete compilation in device-only @@ -170,18 +191,20 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \ // RUN: | FileCheck -check-prefix=DBIN2 %s -// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30) -// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object -// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, sm_35) -// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, sm_35) -// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, sm_35) -// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, sm_35) -// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P10]]}, object +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only 2>&1 \ +// RUN: | FileCheck -check-prefix=DBIN2 %s +// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH:sm_30|gfx803]]) +// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, [[ARCH]]) +// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, [[ARCH]]) +// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P4]]}, object +// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH2:sm_35|gfx900]]) +// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH2]])" {[[P10]]}, object // // Test two gpu architectures up to the assemble phase in device-only @@ -189,13 +212,15 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=DASM2 %s -// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DASM2-DAG: [[P4:[0-9]+]]: offload, 
"device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35) -// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35) -// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35) -// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=DASM2 %s +// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH:sm_30|gfx803]]) +// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, [[ARCH]]) +// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P3]]}, assembler +// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, [[ARCH2:sm_35|gfx900]]) +// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, [[ARCH2]]) +// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, [[ARCH2]]) +// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, [[ARCH2]]) +// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH2]])" {[[P8]]}, assembler