Index: include/clang/Basic/Cuda.h =================================================================== --- include/clang/Basic/Cuda.h +++ include/clang/Basic/Cuda.h @@ -46,6 +46,19 @@ SM_62, SM_70, SM_72, + GFX600, + GFX601, + GFX700, + GFX701, + GFX702, + GFX703, + GFX704, + GFX801, + GFX802, + GFX803, + GFX810, + GFX900, + GFX902, LAST, }; const char *CudaArchToString(CudaArch A); @@ -68,6 +81,7 @@ COMPUTE_62, COMPUTE_70, COMPUTE_72, + COMPUTE_AMDGCN, }; const char *CudaVirtualArchToString(CudaVirtualArch A); Index: include/clang/Basic/DiagnosticDriverKinds.td =================================================================== --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -31,6 +31,8 @@ def err_drv_no_cuda_libdevice : Error< "cannot find libdevice for %0. Provide path to different CUDA installation " "via --cuda-path, or pass -nocudalib to build without linking with libdevice.">; +def err_drv_no_hip_libdevice : Error< + "cannot find libdevice for %0. Did you install hcc2-libdevice?">; def err_drv_cuda_version_unsupported : Error< "GPU arch %0 is supported by CUDA versions between %1 and %2 (inclusive), " "but installation at %3 is %4. Use --cuda-path to specify a different CUDA " Index: include/clang/Driver/Options.td =================================================================== --- include/clang/Driver/Options.td +++ include/clang/Driver/Options.td @@ -539,15 +539,21 @@ def c : Flag<["-"], "c">, Flags<[DriverOption]>, Group, HelpText<"Only run preprocess, compile, and assemble steps">; def cuda_device_only : Flag<["--"], "cuda-device-only">, - HelpText<"Compile CUDA code for device only">; + HelpText<"Compile CUDA/HIP code for device only">; +def : Flag<["--"], "device-only">, Alias; def cuda_host_only : Flag<["--"], "cuda-host-only">, - HelpText<"Compile CUDA code for host only. Has no effect on non-CUDA " + HelpText<"Compile CUDA/HIP code for host only. Has no effect on non-CUDA " "compilations.">; +def : Flag<["--"], "host-only">, Alias; def cuda_compile_host_device : Flag<["--"], "cuda-compile-host-device">, - HelpText<"Compile CUDA code for both host and device (default). Has no " + HelpText<"Compile CUDA/HIP code for both host and device (default). Has no " "effect on non-CUDA compilations.">; +def : Flag<["--"], "compile-host-device">, Alias; def cuda_gpu_arch_EQ : Joined<["--"], "cuda-gpu-arch=">, Flags<[DriverOption]>, - HelpText<"CUDA GPU architecture (e.g. sm_35). May be specified more than once.">; + HelpText<"CUDA/HIP GPU architecture (e.g. sm_35). May be specified more than once.">; +def : Joined<["--"], "offload-arch=">, Alias; +def offload_archs : Joined<["--"], "offload-archs=">, Flags<[DriverOption]>, + HelpText<"List of offload architectures for CUDA/HIP/OpenMP (e.g. sm_35,gfx803).">; def no_cuda_gpu_arch_EQ : Joined<["--"], "no-cuda-gpu-arch=">, Flags<[DriverOption]>, HelpText<"Remove GPU architecture (e.g. sm_35) from the list of GPUs to compile for. 
" "'all' resets the list to its default value.">; Index: include/clang/Driver/ToolChain.h =================================================================== --- include/clang/Driver/ToolChain.h +++ include/clang/Driver/ToolChain.h @@ -121,11 +121,13 @@ path_list ProgramPaths; mutable std::unique_ptr Clang; + mutable std::unique_ptr Backend; mutable std::unique_ptr Assemble; mutable std::unique_ptr Link; mutable std::unique_ptr OffloadBundler; Tool *getClang() const; + Tool *getBackend() const; Tool *getAssemble() const; Tool *getLink() const; Tool *getClangAs() const; @@ -151,6 +153,7 @@ void setTripleEnvironment(llvm::Triple::EnvironmentType Env); virtual Tool *buildAssembler() const; + virtual Tool *buildBackend() const; virtual Tool *buildLinker() const; virtual Tool *getTool(Action::ActionClass AC) const; Index: include/clang/Driver/Types.def =================================================================== --- include/clang/Driver/Types.def +++ include/clang/Driver/Types.def @@ -46,6 +46,9 @@ TYPE("cuda-cpp-output", PP_CUDA, INVALID, "cui", "u") TYPE("cuda", CUDA, PP_CUDA, "cu", "u") TYPE("cuda", CUDA_DEVICE, PP_CUDA, "cu", "") +TYPE("hip-cpp-output", PP_HIP, INVALID, "cui", "u") +TYPE("hip", HIP, PP_HIP, "cu", "u") +TYPE("hip", HIP_DEVICE, PP_HIP, "cu", "") TYPE("objective-c-cpp-output", PP_ObjC, INVALID, "mi", "u") TYPE("objc-cpp-output", PP_ObjC_Alias, INVALID, "mi", "u") TYPE("objective-c", ObjC, PP_ObjC, "m", "u") Index: lib/Basic/Cuda.cpp =================================================================== --- lib/Basic/Cuda.cpp +++ lib/Basic/Cuda.cpp @@ -58,6 +58,32 @@ return "sm_70"; case CudaArch::SM_72: return "sm_72"; + case CudaArch::GFX600: // tahiti + return "gfx600"; + case CudaArch::GFX601: // pitcairn, verde, oland,hainan + return "gfx601"; + case CudaArch::GFX700: // kaveri + return "gfx700"; + case CudaArch::GFX701: // hawaii + return "gfx701"; + case CudaArch::GFX702: // 290,290x,R390,R390x + return "gfx702"; + case CudaArch::GFX703: // kabini mullins + return "gfx703"; + case CudaArch::GFX704: // bonaire + return "gfx704"; + case CudaArch::GFX801: // carrizo + return "gfx801"; + case CudaArch::GFX802: // tonga,iceland + return "gfx802"; + case CudaArch::GFX803: // fiji,polaris10 + return "gfx803"; + case CudaArch::GFX810: // stoney + return "gfx810"; + case CudaArch::GFX900: // vega, instinct + return "gfx900"; + case CudaArch::GFX902: // TBA + return "gfx902"; } llvm_unreachable("invalid enum"); } @@ -78,6 +104,19 @@ .Case("sm_62", CudaArch::SM_62) .Case("sm_70", CudaArch::SM_70) .Case("sm_72", CudaArch::SM_72) + .Case("gfx600", CudaArch::GFX600) + .Case("gfx601", CudaArch::GFX601) + .Case("gfx700", CudaArch::GFX700) + .Case("gfx701", CudaArch::GFX701) + .Case("gfx702", CudaArch::GFX702) + .Case("gfx703", CudaArch::GFX703) + .Case("gfx704", CudaArch::GFX704) + .Case("gfx801", CudaArch::GFX801) + .Case("gfx802", CudaArch::GFX802) + .Case("gfx803", CudaArch::GFX803) + .Case("gfx810", CudaArch::GFX810) + .Case("gfx900", CudaArch::GFX900) + .Case("gfx902", CudaArch::GFX902) .Default(CudaArch::UNKNOWN); } @@ -111,6 +150,8 @@ return "compute_70"; case CudaVirtualArch::COMPUTE_72: return "compute_72"; + case CudaVirtualArch::COMPUTE_AMDGCN: + return "compute_amdgcn"; } llvm_unreachable("invalid enum"); } @@ -130,6 +171,7 @@ .Case("compute_62", CudaVirtualArch::COMPUTE_62) .Case("compute_70", CudaVirtualArch::COMPUTE_70) .Case("compute_72", CudaVirtualArch::COMPUTE_72) + .Case("compute_amdgcn", CudaVirtualArch::COMPUTE_AMDGCN) .Default(CudaVirtualArch::UNKNOWN); } @@ 
-166,6 +208,20 @@ return CudaVirtualArch::COMPUTE_70; case CudaArch::SM_72: return CudaVirtualArch::COMPUTE_72; + case CudaArch::GFX600: + case CudaArch::GFX601: + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX702: + case CudaArch::GFX703: + case CudaArch::GFX704: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: + case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX902: + return CudaVirtualArch::COMPUTE_AMDGCN; } llvm_unreachable("invalid enum"); } @@ -194,6 +250,20 @@ return CudaVersion::CUDA_90; case CudaArch::SM_72: return CudaVersion::CUDA_91; + case CudaArch::GFX600: + case CudaArch::GFX601: + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX702: + case CudaArch::GFX703: + case CudaArch::GFX704: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: + case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX902: + return CudaVersion::CUDA_70; } llvm_unreachable("invalid enum"); } @@ -204,6 +274,19 @@ return CudaVersion::UNKNOWN; case CudaArch::SM_20: case CudaArch::SM_21: + case CudaArch::GFX600: + case CudaArch::GFX601: + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX702: + case CudaArch::GFX703: + case CudaArch::GFX704: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: + case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX902: return CudaVersion::CUDA_80; default: return CudaVersion::LATEST; Index: lib/Basic/Targets.h =================================================================== --- lib/Basic/Targets.h +++ lib/Basic/Targets.h @@ -16,6 +16,7 @@ #ifndef LLVM_CLANG_LIB_BASIC_TARGETS_H #define LLVM_CLANG_LIB_BASIC_TARGETS_H +#include "clang/Basic/Cuda.h" #include "clang/Basic/LangOptions.h" #include "clang/Basic/MacroBuilder.h" #include "clang/Basic/TargetInfo.h" @@ -46,6 +47,9 @@ LLVM_LIBRARY_VISIBILITY void addCygMingDefines(const clang::LangOptions &Opts, clang::MacroBuilder &Builder); + +LLVM_LIBRARY_VISIBILITY +void defineCudaArchMacro(CudaArch GPU, clang::MacroBuilder &Builder); } // namespace targets } // namespace clang #endif // LLVM_CLANG_LIB_BASIC_TARGETS_H Index: lib/Basic/Targets.cpp =================================================================== --- lib/Basic/Targets.cpp +++ lib/Basic/Targets.cpp @@ -112,6 +112,61 @@ addCygMingDefines(Opts, Builder); } +void defineCudaArchMacro(CudaArch GPU, clang::MacroBuilder &Builder) { + std::string CUDAArchCode = [GPU] { + switch (GPU) { + case CudaArch::LAST: + break; + case CudaArch::SM_20: + return "200"; + case CudaArch::SM_21: + return "210"; + case CudaArch::SM_30: + return "300"; + case CudaArch::SM_32: + return "320"; + case CudaArch::SM_35: + return "350"; + case CudaArch::SM_37: + return "370"; + case CudaArch::SM_50: + return "500"; + case CudaArch::SM_52: + return "520"; + case CudaArch::SM_53: + return "530"; + case CudaArch::SM_60: + return "600"; + case CudaArch::SM_61: + return "610"; + case CudaArch::SM_62: + return "620"; + case CudaArch::SM_70: + return "700"; + case CudaArch::SM_72: + return "720"; + case CudaArch::GFX600: + case CudaArch::GFX601: + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX702: + case CudaArch::GFX703: + case CudaArch::GFX704: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: + case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX902: + return "320"; + case CudaArch::UNKNOWN: + llvm_unreachable("unhandled Cuda/HIP Arch"); + } + 
llvm_unreachable("unhandled Cuda/HIP Arch"); + }(); + Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode); +} + //===----------------------------------------------------------------------===// // Driver code //===----------------------------------------------------------------------===// Index: lib/Basic/Targets/AMDGPU.h =================================================================== --- lib/Basic/Targets/AMDGPU.h +++ lib/Basic/Targets/AMDGPU.h @@ -14,6 +14,7 @@ #ifndef LLVM_CLANG_LIB_BASIC_TARGETS_AMDGPU_H #define LLVM_CLANG_LIB_BASIC_TARGETS_AMDGPU_H +#include "clang/Basic/Cuda.h" #include "clang/Basic/TargetInfo.h" #include "clang/Basic/TargetOptions.h" #include "llvm/ADT/StringSet.h" @@ -174,6 +175,7 @@ static bool isAMDGCN(const llvm::Triple &TT) { return TT.getArch() == llvm::Triple::amdgcn; } + CudaArch GCN_Subarch; public: AMDGPUTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts); @@ -330,6 +332,7 @@ else GPU = parseR600Name(Name); + GCN_Subarch = StringToCudaArch(Name); return GK_NONE != GPU.Kind; } Index: lib/Basic/Targets/AMDGPU.cpp =================================================================== --- lib/Basic/Targets/AMDGPU.cpp +++ lib/Basic/Targets/AMDGPU.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "Targets.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/LangOptions.h" #include "clang/Basic/MacroBuilder.h" @@ -263,6 +264,7 @@ resetDataLayout(isAMDGCN(getTriple()) ? DataLayoutStringAMDGCN : DataLayoutStringR600); assert(DataLayout->getAllocaAddrSpace() == Private); + GCN_Subarch = CudaArch::GFX803; // Default to fiji setAddressSpaceMap(Triple.getOS() == llvm::Triple::Mesa3D || !isAMDGCN(Triple)); @@ -307,6 +309,9 @@ if (GPU.Kind != GK_NONE) Builder.defineMacro(Twine("__") + Twine(GPU.CanonicalName) + Twine("__")); + if (Opts.CUDAIsDevice) + defineCudaArchMacro(GCN_Subarch, Builder); + // TODO: __HAS_FMAF__, __HAS_LDEXPF__, __HAS_FP64__ are deprecated and will be // removed in the near future. if (GPU.HasFMAF) Index: lib/Basic/Targets/NVPTX.cpp =================================================================== --- lib/Basic/Targets/NVPTX.cpp +++ lib/Basic/Targets/NVPTX.cpp @@ -153,48 +153,8 @@ MacroBuilder &Builder) const { Builder.defineMacro("__PTX__"); Builder.defineMacro("__NVPTX__"); - if (Opts.CUDAIsDevice) { - // Set __CUDA_ARCH__ for the GPU specified. 
- std::string CUDAArchCode = [this] { - switch (GPU) { - case CudaArch::LAST: - break; - case CudaArch::UNKNOWN: - assert(false && "No GPU arch when compiling CUDA device code."); - return ""; - case CudaArch::SM_20: - return "200"; - case CudaArch::SM_21: - return "210"; - case CudaArch::SM_30: - return "300"; - case CudaArch::SM_32: - return "320"; - case CudaArch::SM_35: - return "350"; - case CudaArch::SM_37: - return "370"; - case CudaArch::SM_50: - return "500"; - case CudaArch::SM_52: - return "520"; - case CudaArch::SM_53: - return "530"; - case CudaArch::SM_60: - return "600"; - case CudaArch::SM_61: - return "610"; - case CudaArch::SM_62: - return "620"; - case CudaArch::SM_70: - return "700"; - case CudaArch::SM_72: - return "720"; - } - llvm_unreachable("unhandled CudaArch"); - }(); - Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode); - } + if (Opts.CUDAIsDevice) + defineCudaArchMacro(GPU, Builder); } ArrayRef NVPTXTargetInfo::getTargetBuiltins() const { Index: lib/Driver/Driver.cpp =================================================================== --- lib/Driver/Driver.cpp +++ lib/Driver/Driver.cpp @@ -2249,9 +2249,10 @@ assert(!GpuArchList.empty() && "We should have at least one GPU architecture."); - // If the host input is not CUDA, we don't need to bother about this - // input. - if (IA->getType() != types::TY_CUDA) { + // If the host input is not CUDA or HIP, we don't need to bother about + // this input. + if ((IA->getType() != types::TY_CUDA) && + IA->getType() != types::TY_HIP) { // The builder will ignore this input. IsActive = false; return ABRT_Inactive; @@ -2264,9 +2265,12 @@ return ABRT_Success; // Replicate inputs for each GPU architecture. - for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) - CudaDeviceActions.push_back(C.MakeAction( - IA->getInputArg(), types::TY_CUDA_DEVICE)); + auto Ty = IA->getType() == types::TY_HIP ? types::TY_HIP_DEVICE + : types::TY_CUDA_DEVICE; + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + CudaDeviceActions.push_back( + C.MakeAction(IA->getInputArg(), Ty)); + } return ABRT_Success; } @@ -2314,7 +2318,8 @@ const ToolChain *HostTC = C.getSingleOffloadToolChain(); assert(HostTC && "No toolchain for host compilation."); - if (HostTC->getTriple().isNVPTX()) { + if (HostTC->getTriple().isNVPTX() || + HostTC->getTriple().getArch() == llvm::Triple::amdgcn) { // We do not support targeting NVPTX for host compilation. Throw // an error and abort pipeline construction early so we don't trip // asserts that assume device-side compilation. @@ -3212,6 +3217,9 @@ bool SaveTemps; bool EmbedBitcode; + /// Type of the input file for the tool + types::ID InputType; + /// Get previous dependent action or null if that does not exist. If /// \a CanBeCollapsed is false, that action must be legal to collapse or /// null will be returned. @@ -3269,6 +3277,8 @@ bool canCollapsePreprocessorAction() const { return !C.getArgs().hasArg(options::OPT_no_integrated_cpp) && !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps && + (InputType != types::TY_LLVM_IR) && + (InputType != types::TY_LLVM_BC) && !C.getArgs().hasArg(options::OPT_rewrite_objc); } @@ -3293,6 +3303,11 @@ ActionInfo[I].SavedOffloadAction.end()); } + static bool isAMDGPUCUDAOffloading(const Action *A, llvm::Triple T) { + return A->isOffloading(Action::OFK_Cuda) && + (StringRef(A->getOffloadingArch()).startswith("gfx") || + T.getArch() == llvm::Triple::amdgcn); + } /// Functions that attempt to perform the combining. 
They detect if that is /// legal, and if so they update the inputs \a Inputs and the offload action /// that were collapsed in \a CollapsedOffloadAction. A tool that deals with @@ -3314,6 +3329,10 @@ if (!AJ || !BJ || !CJ) return nullptr; + // Cannot combine compilation with backend for amdgcn backend + if (!isAMDGPUCUDAOffloading(AJ, TC.getTriple())) + return nullptr; + // Get compiler tool. const Tool *T = TC.SelectTool(*CJ); if (!T) @@ -3345,6 +3364,10 @@ if (!AJ || !BJ) return nullptr; + // Cannot combine assemble with backend for amdgcn backend + if (isAMDGPUCUDAOffloading(AJ, TC.getTriple())) + return nullptr; + // Retrieve the compile job, backend action must always be preceded by one. ActionList CompileJobOffloadActions; auto *CJ = getPrevDependentAction(BJ->getInputs(), CompileJobOffloadActions, @@ -3378,6 +3401,16 @@ if (!BJ || !CJ) return nullptr; + // Cannot combine compilation with backend for amdgcn backend. However + // it is necessary to combine when generating IR for compile-only with + // flags "-c -S -emit-llvm". If only flags "-c -S" the gcn backend is + // needed to generate linked and opt IR for llc, so do not combine. + if (isAMDGPUCUDAOffloading(BJ, TC.getTriple()) && + !(C.getArgs().hasArg(options::OPT_c) && + C.getArgs().hasArg(options::OPT_S) && + C.getArgs().hasArg(options::OPT_emit_llvm))) + return nullptr; + // Get compiler tool. const Tool *T = TC.SelectTool(*CJ); if (!T) @@ -3421,6 +3454,14 @@ EmbedBitcode(EmbedBitcode) { assert(BaseAction && "Invalid base action."); IsHostSelector = BaseAction->getOffloadingDeviceKind() == Action::OFK_None; + // Store the InputType to check if Compile and Backend can collapse + for (Arg *A : C.getInputArgs()) { + if (A->getOption().getKind() == Option::InputClass) { + const char *Value = A->getValue(); + if (const char *Ext = strrchr(Value, '.')) + InputType = TC.LookupTypeForExtension(Ext + 1); + } + } } /// Check if a chain of actions can be combined and return the tool that can @@ -3849,8 +3890,13 @@ CCGenDiagnostics) { StringRef Name = llvm::sys::path::filename(BaseInput); std::pair Split = Name.split('.'); + SmallString<128> fname(Split.first.str().c_str()); + if (!BoundArch.empty()) { + fname += "-"; + fname.append(BoundArch); + } std::string TmpName = GetTemporaryPath( - Split.first, types::getTypeTempSuffix(JA.getType(), IsCLMode())); + fname, types::getTypeTempSuffix(JA.getType(), IsCLMode())); return C.addTempFile(C.getArgs().MakeArgString(TmpName)); } @@ -3921,7 +3967,10 @@ JA.getType() == types::TY_LLVM_BC) Suffixed += ".tmp"; Suffixed += '.'; - Suffixed += Suffix; + if (((StringRef)BaseInput).endswith(".a")) + Suffixed += "a"; + else + Suffixed += Suffix; NamedOutput = C.getArgs().MakeArgString(Suffixed.c_str()); } Index: lib/Driver/SanitizerArgs.cpp =================================================================== --- lib/Driver/SanitizerArgs.cpp +++ lib/Driver/SanitizerArgs.cpp @@ -726,7 +726,8 @@ // NVPTX doesn't currently support sanitizers. Bailing out here means that // e.g. -fsanitize=address applies only to host code, which is what we want // for now. - if (TC.getTriple().isNVPTX()) + if (TC.getTriple().isNVPTX() || + TC.getTriple().getArch() == llvm::Triple::amdgcn) return; // Translate available CoverageFeatures to corresponding clang-cc1 flags. 
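The temporary-file naming change in Driver.cpp above appends the bound GPU architecture to the stem of per-device intermediates, so that a multi-arch compile (for example --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900) produces distinctly named temporaries for each architecture. A minimal standalone sketch of that naming scheme follows; the helper name and the use of std::string (instead of the patch's SmallString) are illustrative only, not part of the patch:

#include <iostream>
#include <string>

// Illustrative sketch, not part of the patch: build a per-architecture stem
// the same way the Driver.cpp hunk above does -- take the base input name up
// to its first '.' and append "-<BoundArch>" when an offload arch is bound.
static std::string deviceTempStem(const std::string &BaseInput,
                                  const std::string &BoundArch) {
  std::string Stem = BaseInput.substr(0, BaseInput.find('.'));
  if (!BoundArch.empty())
    Stem += "-" + BoundArch;
  return Stem;
}

int main() {
  // Prints cuda-phases-gfx803.bc, cuda-phases-gfx900.bc, cuda-phases-sm_30.bc.
  for (const char *Arch : {"gfx803", "gfx900", "sm_30"})
    std::cout << deviceTempStem("cuda-phases.cu", Arch) << ".bc\n";
  return 0;
}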
Index: lib/Driver/ToolChain.cpp =================================================================== --- lib/Driver/ToolChain.cpp +++ lib/Driver/ToolChain.cpp @@ -253,6 +253,8 @@ return Clang.get(); } +Tool *ToolChain::buildBackend() const { return new tools::Clang(*this); } + Tool *ToolChain::buildAssembler() const { return new tools::ClangAs(*this); } @@ -267,6 +269,12 @@ return Assemble.get(); } +Tool *ToolChain::getBackend() const { + if (!Backend) + Backend.reset(buildBackend()); + return Backend.get(); +} + Tool *ToolChain::getClangAs() const { if (!Assemble) Assemble.reset(new tools::ClangAs(*this)); @@ -307,8 +315,9 @@ case Action::AnalyzeJobClass: case Action::MigrateJobClass: case Action::VerifyPCHJobClass: - case Action::BackendJobClass: return getClang(); + case Action::BackendJobClass: + return getBackend(); case Action::OffloadBundlingJobClass: case Action::OffloadUnbundlingJobClass: @@ -406,8 +415,22 @@ } Tool *ToolChain::SelectTool(const JobAction &JA) const { - if (getDriver().ShouldUseClangCompiler(JA)) return getClang(); Action::ActionClass AC = JA.getKind(); + // The amdgcn Backend needs buildBackend() + // if ( StringRef(JA.getOffloadingArch()).startswith("gfx") && + if ((JA.isOffloading(Action::OFK_Cuda) || + JA.isOffloading(Action::OFK_OpenMP)) && + (StringRef(JA.getOffloadingArch()).startswith("gfx") || + (getTriple().getArch() == llvm::Triple::amdgcn)) && + (AC == Action::BackendJobClass)) { + if ((Args.hasArg(options::OPT_emit_llvm)) || + (Args.hasArg(options::OPT_emit_llvm_bc))) + return getClang(); // Dont run amdgcn backend if we just want LLVM IR + else + return getTool(AC); + }; + if (getDriver().ShouldUseClangCompiler(JA)) + return getClang(); if (AC == Action::AssembleJobClass && useIntegratedAs()) return getClangAs(); return getTool(AC); Index: lib/Driver/ToolChains/Clang.cpp =================================================================== --- lib/Driver/ToolChains/Clang.cpp +++ lib/Driver/ToolChains/Clang.cpp @@ -2324,9 +2324,10 @@ ArgStringList &CmdArgs, bool KernelOrKext) { const llvm::Triple &EffectiveTriple = TC.getEffectiveTriple(); - // NVPTX doesn't support stack protectors; from the compiler's perspective, it - // doesn't even have a stack! - if (EffectiveTriple.isNVPTX()) + // NVPTX and GCN don't support stack protectors; from the compiler's + // perspective, it doesn't even have a stack! + if (EffectiveTriple.isNVPTX() || + EffectiveTriple.getArch() == llvm::Triple::amdgcn) return; // -stack-protector=0 is default. @@ -3080,7 +3081,17 @@ const ArgList &Args, const char *LinkingOutput) const { const llvm::Triple &RawTriple = getToolChain().getTriple(); const llvm::Triple &Triple = getToolChain().getEffectiveTriple(); - const std::string &TripleStr = Triple.getTriple(); + + bool Is_amdgcn = StringRef(JA.getOffloadingArch()).startswith("gfx") || + (getToolChain().getArch() == llvm::Triple::amdgcn); + // Currently cuda driver only support offload triple nvptx64-nvidia-cuda. + // Switch this from nvptx to amdgcn iff the subarch is a gfx processor. + // We cannot fix Driver.cpp because we want to offload to multiple archs. + const std::string &TripleStr = + Is_amdgcn && (JA.isOffloading(Action::OFK_Cuda) || + JA.isOffloading(Action::OFK_OpenMP)) + ? 
"amdgcn-amd-amdhsa" + : Triple.getTriple(); bool KernelOrKext = Args.hasArg(options::OPT_mkernel, options::OPT_fapple_kext); @@ -3496,7 +3507,8 @@ // Enable -mconstructor-aliases except on darwin, where we have to work around // a linker bug (see ), and CUDA device code, where // aliases aren't supported. - if (!RawTriple.isOSDarwin() && !RawTriple.isNVPTX()) + if (!RawTriple.isOSDarwin() && !RawTriple.isNVPTX() && + RawTriple.getArch() != llvm::Triple::amdgcn) CmdArgs.push_back("-mconstructor-aliases"); // Darwin's kernel doesn't support guard variables; just die if we Index: lib/Driver/ToolChains/Cuda.h =================================================================== --- lib/Driver/ToolChains/Cuda.h +++ lib/Driver/ToolChains/Cuda.h @@ -30,6 +30,7 @@ private: const Driver &D; bool IsValid = false; + bool UseOpenHeaders = false; CudaVersion Version = CudaVersion::UNKNOWN; std::string InstallPath; std::string BinPath; @@ -57,6 +58,7 @@ /// \brief Check whether we detected a valid Cuda install. bool isValid() const { return IsValid; } + bool useOpenHeaders() const { return UseOpenHeaders; } /// \brief Print information about the detected CUDA installation. void print(raw_ostream &OS) const; @@ -127,6 +129,29 @@ }; } // end namespace NVPTX + +namespace AMDGCN { +// for amdgcn the backend is llvm-link + opt +class LLVM_LIBRARY_VISIBILITY Backend : public Tool { +public: + Backend(const ToolChain &TC) + : Tool("AMDGCN::Backend", "GPU-backend", TC, RF_Full, llvm::sys::WEM_UTF8, + "--options-file") {} + virtual bool hasIntegratedCPP() const override { return false; } + virtual void ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const override; +}; +} // end namespace AMDGCN +bool addBCLib(Compilation &C, const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs, + llvm::opt::ArgStringList LibraryPaths, const char *BCName); + +bool addEnvListWithSpaces(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs, + const char *EnvVar); } // end namespace tools namespace toolchains { @@ -184,6 +209,7 @@ CudaInstallationDetector CudaInstallation; protected: + Tool *buildBackend() const override; // for amdgcn, link and opt Tool *buildAssembler() const override; // ptxas Tool *buildLinker() const override; // fatbinary (ok, not really a linker) Index: lib/Driver/ToolChains/Cuda.cpp =================================================================== --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -222,10 +222,55 @@ IsValid = true; break; } + + ArgStringList LibraryPaths; + for (auto Arg : Args) { + if (Arg->getSpelling() == "-L") { + llvm::Twine DevicePath = Twine(Arg->getValue()).concat("/libdevice/"); + if (D.getVFS().exists(DevicePath)) { + LibraryPaths.push_back(Args.MakeArgString(DevicePath)); + } else { + if (D.getVFS().exists(Arg->getValue())) + LibraryPaths.push_back(Arg->getValue()); + } + } + } + LibraryPaths.push_back(Args.MakeArgString(D.Dir + "/../lib/libdevice/")); + + // Search for GCN Device Libraries + for (Arg *A : Args) { + if (A->getOption().matches(options::OPT_cuda_gpu_arch_EQ) && + StringRef(A->getValue()).startswith("gfx")) { + StringRef GFXNAME = A->getValue(); + for (auto LP : LibraryPaths) { + StringRef GCNPath = Args.MakeArgString(LP + GFXNAME); + if (D.getVFS().exists(GCNPath)) { + UseOpenHeaders = true; + LibDeviceMap[GFXNAME.str()] = GCNPath; + break; + } + } + } + } } void 
CudaInstallationDetector::AddCudaIncludeArgs( const ArgList &DriverArgs, ArgStringList &CC1Args) const { + + if (DriverArgs.hasArg(options::OPT_x) && + (((StringRef)DriverArgs.getLastArg(options::OPT_x)->getValue()) == + "hip")) { + + // HIP needs c++11 + CC1Args.push_back("-std=c++11"); + // Skip cuda includes on host & device passes if a single gfx specified + // We want cuda includes for hip on nvidia targets. + for (Arg *A : DriverArgs) { + if (A->getOption().matches(options::OPT_cuda_gpu_arch_EQ) && + StringRef(A->getValue()).startswith("gfx")) + return; + } + } if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { // Add cuda_wrappers/* to our system include path. This lets us wrap // standard library headers. @@ -239,7 +284,7 @@ if (DriverArgs.hasArg(options::OPT_nocudainc)) return; - if (!isValid()) { + if (!isValid() && !useOpenHeaders()) { D.Diag(diag::err_drv_no_cuda_installation); return; } @@ -247,7 +292,10 @@ CC1Args.push_back("-internal-isystem"); CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath())); CC1Args.push_back("-include"); - CC1Args.push_back("__clang_cuda_runtime_wrapper.h"); + if (useOpenHeaders()) + CC1Args.push_back("__clang_cuda_runtime_wrapper_open.h"); + else + CC1Args.push_back("__clang_cuda_runtime_wrapper.h"); } void CudaInstallationDetector::CheckCudaVersionSupportsArch( @@ -273,6 +321,144 @@ << CudaVersionToString(Version) << "\n"; } +bool tools::addBCLib(Compilation &C, const ArgList &Args, + ArgStringList &CmdArgs, ArgStringList LibraryPaths, + const char *BCName) { + std::string FullName; + bool FoundLibDevice = false; + for (std::string LibraryPath : LibraryPaths) { + FullName = Args.MakeArgString(LibraryPath + "/" + BCName); + if (llvm::sys::fs::exists(FullName.c_str())) { + FoundLibDevice = true; + break; + } + } + if (!FoundLibDevice) + C.getDriver().Diag(diag::err_drv_no_such_file) << BCName; + CmdArgs.push_back(Args.MakeArgString(FullName)); + return FoundLibDevice; +} + +/// addEnvListWithSpaces adds command line args from an environment variable +bool tools::addEnvListWithSpaces(const ArgList &Args, ArgStringList &CmdArgs, + const char *EnvVar) { + const char *DirList = ::getenv(EnvVar); + if (!DirList) + return false; + StringRef Dirs(DirList); + if (Dirs.empty()) + return false; + StringRef::size_type Delim; + while ((Delim = Dirs.find(" ")) != StringRef::npos) { + if (Delim != 0) + CmdArgs.push_back(Args.MakeArgString(Dirs.substr(0, Delim))); + Dirs = Dirs.substr(Delim + 1); // Trim front space + } + if (!Dirs.empty()) // Last arg may have no spaces + CmdArgs.push_back(Args.MakeArgString(Dirs)); + return true; +} + +void AMDGCN::Backend::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + + assert(StringRef(JA.getOffloadingArch()).startswith("gfx") && + " unless gfx processor, backend should be clang"); + + // For amdgcn the Backend Job will call llvm-link & opt steps + ArgStringList CmdArgs; + // Add the input bc's created by compile step + for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); + it != ie; ++it) { + const InputInfo &II = *it; + CmdArgs.push_back(II.getFilename()); + } + + std::string GFXNAME = JA.getOffloadingArch(); + + ArgStringList LibraryPaths; + + // Find in -L and LIBRARY_PATH. 
+ for (auto Arg : Args) { + if (Arg->getSpelling() == "-L") { + LibraryPaths.push_back(Args.MakeArgString( + std::string(Arg->getValue()) + "/libdevice/" + std::string(GFXNAME))); + LibraryPaths.push_back(Args.MakeArgString(Arg->getValue())); + } + } + + // add the compiler installation libdevice last so -L will override them. + LibraryPaths.push_back(Args.MakeArgString( + C.getDriver().Dir + "/../lib/libdevice/" + std::string(GFXNAME))); + + addDirectoryList(Args, LibraryPaths, "-L", "LIBRARY_PATH"); + + addBCLib(C, Args, CmdArgs, LibraryPaths, "libhiprt.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "opencl.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "ockl.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "irif.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "ocml.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_finite_only_off.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_daz_opt_off.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, + "oclc_correctly_rounded_sqrt_on.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_unsafe_math_off.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "hc.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_isa_version.amdgcn.bc"); + + addEnvListWithSpaces(Args, CmdArgs, "CLANG_TARGET_LINK_OPTS"); + CmdArgs.push_back("-suppress-warnings"); + + // Add an intermediate output file which is input to opt + CmdArgs.push_back("-o"); + std::string TmpName = C.getDriver().GetTemporaryPath("OPT_INPUT", "bc"); + const char *ResultingBitcodeF = + C.addTempFile(C.getArgs().MakeArgString(TmpName.c_str())); + CmdArgs.push_back(ResultingBitcodeF); + const char *Exec = Args.MakeArgString(C.getDriver().Dir + "/llvm-link"); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + + ArgStringList OptArgs; + // The input to opt is the output from llvm-link + OptArgs.push_back(ResultingBitcodeF); + // Add CLANG_TARGETOPT_OPTS override options to opt + if (!addEnvListWithSpaces(Args, OptArgs, "CLANG_TARGET_OPT_OPTS")) { + // If CLANG_TARGET_OPT_OPTS not set, add optimization arg + if (Arg *A = Args.getLastArg(options::OPT_O_Group)) { + StringRef OOpt = "3"; + if (A->getOption().matches(options::OPT_O4) || + A->getOption().matches(options::OPT_Ofast)) + OOpt = "3"; + else if (A->getOption().matches(options::OPT_O0)) + OOpt = "0"; + else if (A->getOption().matches(options::OPT_O)) { + // -Os, -Oz, and -O(anything else) map to -O2 + OOpt = llvm::StringSwitch(A->getValue()) + .Case("1", "1") + .Case("2", "2") + .Case("3", "3") + .Case("s", "2") + .Case("z", "2") + .Default("2"); + } + OptArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt)); + } + OptArgs.push_back("-S"); + const char *mcpustr = Args.MakeArgString("-mcpu=" + GFXNAME); + OptArgs.push_back(mcpustr); + OptArgs.push_back("-dce"); + OptArgs.push_back("-sroa"); + OptArgs.push_back("-globaldce"); + } + OptArgs.push_back("-o"); + OptArgs.push_back(Output.getFilename()); + const char *OptExec = Args.MakeArgString(C.getDriver().Dir + "/opt"); + C.addCommand(llvm::make_unique(JA, *this, OptExec, OptArgs, Inputs)); +} + void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, @@ -280,7 +466,9 @@ const char *LinkingOutput) const { const auto &TC = static_cast(getToolChain()); - assert(TC.getTriple().isNVPTX() && "Wrong platform"); + assert((TC.getTriple().isNVPTX() || + TC.getTriple().getArch() == llvm::Triple::amdgcn) && + "Wrong platform"); StringRef 
GPUArchName; // If this is an OpenMP action we need to extract the device architecture @@ -297,6 +485,41 @@ assert(gpu_arch != CudaArch::UNKNOWN && "Device action expected to have an architecture."); + // For amdgcn this job will call llc (AMD Lightning Compiler) + if (StringRef(JA.getOffloadingArch()).startswith("gfx")) { + ArgStringList CmdArgs; + for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); + it != ie; ++it) { + const InputInfo &II = *it; + CmdArgs.push_back(II.getFilename()); + } + CmdArgs.push_back("-mtriple=amdgcn-amd-amdhsa"); + CmdArgs.push_back("-filetype=obj"); + addEnvListWithSpaces(Args, CmdArgs, "CLANG_TARGET_LLC_OPTS"); + std::string GFXNAME = JA.getOffloadingArch(); + CmdArgs.push_back(Args.MakeArgString("-mcpu=" + GFXNAME)); + CmdArgs.push_back("-o"); + std::string TmpName = C.getDriver().GetTemporaryPath("LC_OUTPUT", "o"); + const char *llcOutputFile = + C.addTempFile(C.getArgs().MakeArgString(TmpName.c_str())); + CmdArgs.push_back(llcOutputFile); + const char *Exec = Args.MakeArgString(C.getDriver().Dir + "/llc"); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + + ArgStringList CmdArgs2; + CmdArgs2.push_back("-flavor"); + CmdArgs2.push_back("gnu"); + CmdArgs2.push_back("--no-undefined"); + CmdArgs2.push_back("-shared"); + // The output from ld.lld is an HSA code object file + CmdArgs2.push_back("-o"); + CmdArgs2.push_back(Output.getFilename()); + CmdArgs2.push_back(llcOutputFile); + const char *lld = Args.MakeArgString(C.getDriver().Dir + "/lld"); + C.addCommand(llvm::make_unique(JA, *this, lld, CmdArgs2, Inputs)); + return; + } + // Check that our installation's ptxas supports gpu_arch. if (!Args.hasArg(options::OPT_no_cuda_version_check)) { TC.CudaInstallation.CheckCudaVersionSupportsArch(gpu_arch); @@ -387,7 +610,51 @@ const char *LinkingOutput) const { const auto &TC = static_cast(getToolChain()); - assert(TC.getTriple().isNVPTX() && "Wrong platform"); + assert((TC.getTriple().isNVPTX() || + TC.getTriple().getArch() == llvm::Triple::amdgcn) && + "Wrong platform"); + + // This job builds composite cubin file from each output of the assemble step + // There are 2 scenarios and the command(s) needed for each. 
+ // 1 amdgpu targets - Run clang tool "clang-assemble-fatbin" + // 2 nvptx targets - Run cuda SDK "fatbinary" program + bool found_amdgcn = false; + bool found_nvptx = false; + for (const auto &II : Inputs) { + if (StringRef(II.getAction()->getOffloadingArch()).startswith("gfx")) + found_amdgcn = true; + else + found_nvptx = true; + } + + // amdgcn targets, call clang-assemble-fatbin ------------------------ + if (found_amdgcn) { + + ArgStringList CmdArgs; + CmdArgs.push_back(Args.MakeArgString("-type=o")); + + std::string targets = "-targets=host-x86_64-unknown-linux"; + std::string inputs = "-inputs=/dev/null"; + for (const auto &II : Inputs) { + if (II.getType() != types::TY_PP_Asm) { + targets = targets + ",openmp-amdgcn--amdhsa-" + + StringRef(II.getAction()->getOffloadingArch()).str(); + inputs = inputs + "," + II.getFilename(); + } + } + CmdArgs.push_back(Args.MakeArgString(targets)); + CmdArgs.push_back(Args.MakeArgString(inputs)); + + auto outputArgString = Args.MakeArgString( + std::string("-outputs=").append(Output.getFilename())); + CmdArgs.push_back(outputArgString); + + const char *Exec = + Args.MakeArgString(C.getDriver().Dir + "/clang-offload-bundler"); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + return; + } + // nvptx targets found, so just call fatbinary ----------------------- ArgStringList CmdArgs; CmdArgs.push_back("--cuda"); @@ -541,7 +808,8 @@ CC1Args.push_back("-fcuda-is-device"); if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero, - options::OPT_fno_cuda_flush_denormals_to_zero, false)) + options::OPT_fno_cuda_flush_denormals_to_zero, + false)) CC1Args.push_back("-fcuda-flush-denormals-to-zero"); if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals, @@ -563,10 +831,17 @@ DriverArgs.hasArg(options::OPT_S)) return; - getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch; + if (GpuArch.startswith("gfx")) + getDriver().Diag(diag::err_drv_no_hip_libdevice) << GpuArch; + else + getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch; return; } + // Do not add -link-cuda-bitcode or ptx42 features if gfx + if (GpuArch.startswith("gfx")) + return; + CC1Args.push_back("-mlink-cuda-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile)); @@ -704,11 +979,16 @@ if (!BoundArch.empty()) { DAL->eraseArg(options::OPT_march_EQ); - DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch); + DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), + BoundArch); } return DAL; } +Tool *CudaToolChain::buildBackend() const { + return new tools::AMDGCN::Backend(*this); +} + Tool *CudaToolChain::buildAssembler() const { return new tools::NVPTX::Assembler(*this); } Index: lib/Driver/ToolChains/Gnu.cpp =================================================================== --- lib/Driver/ToolChains/Gnu.cpp +++ lib/Driver/ToolChains/Gnu.cpp @@ -471,6 +471,25 @@ // The profile runtime also needs access to system libraries. 
getToolChain().addProfileRTLibs(Args, CmdArgs); + bool Is_amdgcn = false; + for (Arg *A : Args) { + if (A->getOption().matches(options::OPT_cuda_gpu_arch_EQ) && + StringRef(A->getValue()).startswith("gfx")) { + Is_amdgcn = true; + break; + } + } + + // The hip runtimes are installed in lib dir of compiler installation + if (Args.hasArg(options::OPT_x) && + (((StringRef)Args.getLastArg(options::OPT_x)->getValue()) == "hip") && + Is_amdgcn) { + CmdArgs.push_back("-lhip_hcc"); + CmdArgs.push_back("-lhiprt"); + CmdArgs.push_back("-rpath"); + CmdArgs.push_back(Args.MakeArgString(D.Dir + "/../lib")); + } + if (D.CCCIsCXX() && !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { if (ToolChain.ShouldLinkCXXStdlib(Args)) { Index: lib/Driver/Types.cpp =================================================================== --- lib/Driver/Types.cpp +++ lib/Driver/Types.cpp @@ -102,6 +102,9 @@ case TY_CL: case TY_CUDA: case TY_PP_CUDA: case TY_CUDA_DEVICE: + case TY_HIP: + case TY_PP_HIP: + case TY_HIP_DEVICE: case TY_ObjC: case TY_PP_ObjC: case TY_PP_ObjC_Alias: case TY_CXX: case TY_PP_CXX: case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias: @@ -141,6 +144,9 @@ case TY_ObjCXXHeader: case TY_PP_ObjCXXHeader: case TY_CXXModule: case TY_PP_CXXModule: case TY_CUDA: case TY_PP_CUDA: case TY_CUDA_DEVICE: + case TY_HIP: + case TY_PP_HIP: + case TY_HIP_DEVICE: return true; } } @@ -166,6 +172,9 @@ case TY_CUDA: case TY_PP_CUDA: case TY_CUDA_DEVICE: + case TY_HIP: + case TY_PP_HIP: + case TY_HIP_DEVICE: return true; } } Index: lib/Frontend/CompilerInstance.cpp =================================================================== --- lib/Frontend/CompilerInstance.cpp +++ lib/Frontend/CompilerInstance.cpp @@ -410,7 +410,8 @@ // triple (the host triple) to initialize our header search, since we need to // find the host headers in order to compile the CUDA code. const llvm::Triple *HeaderSearchTriple = &PP->getTargetInfo().getTriple(); - if (PP->getTargetInfo().getTriple().getOS() == llvm::Triple::CUDA && + if ((PP->getTargetInfo().getTriple().getOS() == llvm::Triple::CUDA || + PP->getTargetInfo().getTriple().getOS() == llvm::Triple::AMDHSA) && PP->getAuxTargetInfo()) HeaderSearchTriple = &PP->getAuxTargetInfo()->getTriple(); Index: test/Driver/cuda-phases.cu =================================================================== --- test/Driver/cuda-phases.cu +++ test/Driver/cuda-phases.cu @@ -7,22 +7,26 @@ // REQUIRES: clang-driver // REQUIRES: powerpc-registered-target // REQUIRES: nvptx-registered-target - +// REQUIRES: amdgpu-registered-target // // Test single gpu architecture with complete compilation. 
// // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \ -// RUN: | FileCheck -check-prefix=BIN %s -// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s +// BIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-cuda) +// BIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-cuda) +// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda) // BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30) -// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30) -// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30) -// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30) -// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object -// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler +// BIN_NV-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH:sm_30]]) +// BIN_AMD-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH:gfx803]]) +// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]]) +// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, [[ARCH]]) +// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, [[ARCH]]) +// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, [[ARCH]]) +// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P7]]}, object +// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P6]]}, assembler // BIN-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-cuda) // BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P10]]}, ir // BIN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-cuda) @@ -34,13 +38,15 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \ // RUN: | FileCheck -check-prefix=ASM %s -// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (host-cuda) +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefix=ASM %s +// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH:sm_30|gfx803]]) +// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, 
[[ARCH]]) +// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P3]]}, assembler +// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-cuda) +// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (host-cuda) // ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-cuda) // ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-cuda) @@ -49,23 +55,25 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ // RUN: | FileCheck -check-prefix=BIN2 %s -// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \ +// RUN: | FileCheck -check-prefix=BIN2 %s +// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (host-cuda) +// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda) // BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30) -// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30) -// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30) -// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30) -// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object -// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler -// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, sm_35) -// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, sm_35) -// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, sm_35) -// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, sm_35) -// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P14]]}, object -// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P13]]}, assembler +// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH1:sm_30|gfx803]]) +// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH1]])" {[[P7]]}, object +// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH1]])" {[[P6]]}, assembler +// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH2:sm_35|gfx900]]) +// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, 
(device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH2]])" {[[P14]]}, object +// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH2]])" {[[P13]]}, assembler // BIN2-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-cuda) // BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P17]]}, ir // BIN2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-cuda) @@ -77,18 +85,20 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \ // RUN: | FileCheck -check-prefix=ASM2 %s -// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35) -// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35) -// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35) -// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler -// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (host-cuda) +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefix=ASM2 %s +// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH1:sm_30|gfx803]]) +// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH1]]) +// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH1]]) +// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH1]]) +// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH1]])" {[[P3]]}, assembler +// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH2:sm_35|gfx900]]) +// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-cuda, [[ARCH2]]) +// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, [[ARCH2]]) +// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, [[ARCH2]]) +// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH2]])" {[[P8]]}, assembler +// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-cuda) +// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (host-cuda) // ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-cuda) // ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-cuda) @@ -98,8 +108,10 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \ // RUN: | FileCheck -check-prefix=HBIN %s -// HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HBIN-DAG: 
[[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefix=HBIN %s +// HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (host-cuda) +// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda) // HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) // HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) // HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda) @@ -110,8 +122,10 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=HASM %s -// HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-host-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=HASM %s +// HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (host-cuda) +// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda) // HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) // HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) @@ -121,8 +135,10 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \ // RUN: | FileCheck -check-prefix=HBIN2 %s -// HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefix=HBIN2 %s +// HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (host-cuda) +// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda) // HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) // HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) // HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda) @@ -134,8 +150,10 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=HASM2 %s -// HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=HASM2 %s +// HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (host-cuda) +// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda) // HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) // HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) @@ -145,12 +163,14 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \ // RUN: | FileCheck -check-prefix=DBIN %s -// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DBIN-DAG: [[P1:[0-9]+]]: 
preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30) -// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-device-only 2>&1 \ +// RUN: | FileCheck -check-prefix=DBIN %s +// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH:sm_30|gfx803]]) +// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P4]]}, object // // Test single gpu architecture up to the assemble phase in device-only @@ -158,11 +178,13 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=DASM %s -// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-device-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=DASM %s +// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH:sm_30|gfx803]]) +// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]]) +// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P3]]}, assembler // // Test two gpu architectures with complete compilation in device-only @@ -170,18 +192,20 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \ // RUN: | FileCheck -check-prefix=DBIN2 %s -// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30) -// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object -// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, sm_35) -// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, sm_35) -// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, 
assembler, (device-cuda, sm_35) -// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, sm_35) -// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P10]]}, object +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only 2>&1 \ +// RUN: | FileCheck -check-prefix=DBIN2 %s +// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH:sm_30|gfx803]]) +// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]]) +// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, [[ARCH]]) +// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P4]]}, object +// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH2:sm_35|gfx900]]) +// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, [[T]]-cpp-output, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH2]])" {[[P10]]}, object // // Test two gpu architectures up to the assemble phase in device-only @@ -189,13 +213,15 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=DASM2 %s -// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35) -// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35) -// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35) -// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=DASM2 %s +// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH:sm_30|gfx803]]) +// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]]) +// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH]])" {[[P3]]}, assembler +// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH2:sm_35|gfx900]]) +// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-cuda, [[ARCH2]]) 
+// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, [[ARCH2]])
+// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, [[ARCH2]])
+// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:[[ARCH2]])" {[[P8]]}, assembler
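For reference, the sources these phase tests drive are ordinary mixed host/device translation units. With this patch, the device pass for gfx targets also defines __CUDA_ARCH__ (uniformly to 320, via defineCudaArchMacro in Targets.cpp above), so CUDA-style guards keep working under -x hip. A hedged sketch of such an input; the kernel and file contents below are illustrative and not taken from cuda-phases.cu:

#include <cstdio>

// Illustrative device kernel; __global__ and the built-in index variables are
// provided by the CUDA/HIP headers the driver injects for the device pass.
__global__ void scale(float *Out, const float *In, float Factor, int N) {
#ifdef __CUDA_ARCH__ // defined on device passes; 320 for every gfx target here
  int I = blockIdx.x * blockDim.x + threadIdx.x;
  if (I < N)
    Out[I] = In[I] * Factor;
#endif
}

int main() {
  // Host side of the translation unit. The phase tests above only inspect the
  // driver's job graph, so no kernel launch is required.
  std::printf("compiled as host-cuda\n");
  return 0;
}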