Index: include/clang/Basic/DiagnosticDriverKinds.td =================================================================== --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -31,6 +31,8 @@ def err_drv_no_cuda_libdevice : Error< "cannot find libdevice for %0. Provide path to different CUDA installation " "via --cuda-path, or pass -nocudalib to build without linking with libdevice.">; +def err_drv_no_hip_libdevice : Error< + "cannot find libdevice for %0. Please install device library.">; def err_drv_cuda_version_unsupported : Error< "GPU arch %0 is supported by CUDA versions between %1 and %2 (inclusive), " "but installation at %3 is %4. Use --cuda-path to specify a different CUDA " Index: include/clang/Driver/Options.td =================================================================== --- include/clang/Driver/Options.td +++ include/clang/Driver/Options.td @@ -547,7 +547,10 @@ HelpText<"Compile CUDA code for both host and device (default). Has no " "effect on non-CUDA compilations.">; def cuda_gpu_arch_EQ : Joined<["--"], "cuda-gpu-arch=">, Flags<[DriverOption]>, - HelpText<"CUDA GPU architecture (e.g. sm_35). May be specified more than once.">; + HelpText<"CUDA/HIP GPU architecture (e.g. sm_35). May be specified more than once.">; +def : Joined<["--"], "offload-arch=">, Alias; +def offload_archs : Joined<["--"], "offload-archs=">, Flags<[DriverOption]>, + HelpText<"List of offload architectures for CUDA/HIP/OpenMP (e.g. sm_35,gfx803).">; def no_cuda_gpu_arch_EQ : Joined<["--"], "no-cuda-gpu-arch=">, Flags<[DriverOption]>, HelpText<"Remove GPU architecture (e.g. sm_35) from the list of GPUs to compile for. 
" "'all' resets the list to its default value.">; Index: include/clang/Driver/ToolChain.h =================================================================== --- include/clang/Driver/ToolChain.h +++ include/clang/Driver/ToolChain.h @@ -121,11 +121,13 @@ path_list ProgramPaths; mutable std::unique_ptr Clang; + mutable std::unique_ptr DeviceLibraryLink; mutable std::unique_ptr Assemble; mutable std::unique_ptr Link; mutable std::unique_ptr OffloadBundler; Tool *getClang() const; + Tool *getDeviceLibraryLink() const; Tool *getAssemble() const; Tool *getLink() const; Tool *getClangAs() const; @@ -151,6 +153,7 @@ void setTripleEnvironment(llvm::Triple::EnvironmentType Env); virtual Tool *buildAssembler() const; + virtual Tool *buildDeviceLibraryLinker() const; virtual Tool *buildLinker() const; virtual Tool *getTool(Action::ActionClass AC) const; Index: include/clang/Driver/Types.def =================================================================== --- include/clang/Driver/Types.def +++ include/clang/Driver/Types.def @@ -46,6 +46,9 @@ TYPE("cuda-cpp-output", PP_CUDA, INVALID, "cui", "u") TYPE("cuda", CUDA, PP_CUDA, "cu", "u") TYPE("cuda", CUDA_DEVICE, PP_CUDA, "cu", "") +TYPE("hip-cpp-output", PP_HIP, INVALID, "cui", "u") +TYPE("hip", HIP, PP_HIP, "cu", "u") +TYPE("hip", HIP_DEVICE, PP_HIP, "cu", "") TYPE("objective-c-cpp-output", PP_ObjC, INVALID, "mi", "u") TYPE("objc-cpp-output", PP_ObjC_Alias, INVALID, "mi", "u") TYPE("objective-c", ObjC, PP_ObjC, "m", "u") Index: lib/Driver/Driver.cpp =================================================================== --- lib/Driver/Driver.cpp +++ lib/Driver/Driver.cpp @@ -538,13 +538,25 @@ // CUDA // // We need to generate a CUDA toolchain if any of the inputs has a CUDA type. + // ToDo: Handle mixed CUDA/HIP input files and -x hip option. Diagnose + // CUDA on amdgcn and HIP on nvptx. 
if (llvm::any_of(Inputs, [](std::pair &I) { return types::isCuda(I.first); })) { const ToolChain *HostTC = C.getSingleOffloadToolChain(); const llvm::Triple &HostTriple = HostTC->getTriple(); - llvm::Triple CudaTriple(HostTriple.isArch64Bit() ? "nvptx64-nvidia-cuda" - : "nvptx-nvidia-cuda"); + StringRef DeviceTripleStr; + if (C.getInputArgs().hasArg(options::OPT_x) && + StringRef(C.getInputArgs().getLastArg(options::OPT_x)->getValue()) == + "hip") { + // HIP is only supported on amdgcn. + DeviceTripleStr = "amdgcn-amd-amdhsa"; + } else { + // CUDA is only supported on nvptx. + DeviceTripleStr = HostTriple.isArch64Bit() ? "nvptx64-nvidia-cuda" + : "nvptx-nvidia-cuda"; + } + llvm::Triple CudaTriple(DeviceTripleStr); // Use the CUDA and host triples as the key into the ToolChains map, because // the device toolchain we create depends on both. auto &CudaTC = ToolChains[CudaTriple.str() + "/" + HostTriple.str()]; @@ -2249,9 +2261,10 @@ assert(!GpuArchList.empty() && "We should have at least one GPU architecture."); - // If the host input is not CUDA, we don't need to bother about this - // input. - if (IA->getType() != types::TY_CUDA) { + // If the host input is not CUDA or HIP, we don't need to bother about + // this input. + if ((IA->getType() != types::TY_CUDA) && + IA->getType() != types::TY_HIP) { // The builder will ignore this input. IsActive = false; return ABRT_Inactive; @@ -2264,9 +2277,12 @@ return ABRT_Success; // Replicate inputs for each GPU architecture. - for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) - CudaDeviceActions.push_back(C.MakeAction( - IA->getInputArg(), types::TY_CUDA_DEVICE)); + auto Ty = IA->getType() == types::TY_HIP ? 
types::TY_HIP_DEVICE + : types::TY_CUDA_DEVICE; + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + CudaDeviceActions.push_back( + C.MakeAction(IA->getInputArg(), Ty)); + } return ABRT_Success; } @@ -2314,7 +2330,8 @@ const ToolChain *HostTC = C.getSingleOffloadToolChain(); assert(HostTC && "No toolchain for host compilation."); - if (HostTC->getTriple().isNVPTX()) { + if (HostTC->getTriple().isNVPTX() || + HostTC->getTriple().getArch() == llvm::Triple::amdgcn) { // We do not support targeting NVPTX for host compilation. Throw // an error and abort pipeline construction early so we don't trip // asserts that assume device-side compilation. @@ -3212,6 +3229,9 @@ bool SaveTemps; bool EmbedBitcode; + /// Type of the input file for the tool + types::ID InputType; + /// Get previous dependent action or null if that does not exist. If /// \a CanBeCollapsed is false, that action must be legal to collapse or /// null will be returned. @@ -3269,6 +3289,8 @@ bool canCollapsePreprocessorAction() const { return !C.getArgs().hasArg(options::OPT_no_integrated_cpp) && !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps && + (InputType != types::TY_LLVM_IR) && + (InputType != types::TY_LLVM_BC) && !C.getArgs().hasArg(options::OPT_rewrite_objc); } @@ -3293,6 +3315,11 @@ ActionInfo[I].SavedOffloadAction.end()); } + static bool isAMDGPUCUDAOffloading(const Action *A, llvm::Triple T) { + return A->isOffloading(Action::OFK_Cuda) && + (StringRef(A->getOffloadingArch()).startswith("gfx") || + T.getArch() == llvm::Triple::amdgcn); + } /// Functions that attempt to perform the combining. They detect if that is /// legal, and if so they update the inputs \a Inputs and the offload action /// that were collapsed in \a CollapsedOffloadAction. 
A tool that deals with @@ -3314,6 +3341,10 @@ if (!AJ || !BJ || !CJ) return nullptr; + // Cannot combine compilation with backend for amdgcn backend + if (isAMDGPUCUDAOffloading(AJ, TC.getTriple())) + return nullptr; + // Get compiler tool. const Tool *T = TC.SelectTool(*CJ); if (!T) @@ -3345,6 +3376,10 @@ if (!AJ || !BJ) return nullptr; + // Cannot combine assemble with backend for amdgcn backend + if (isAMDGPUCUDAOffloading(AJ, TC.getTriple())) + return nullptr; + // Retrieve the compile job, backend action must always be preceded by one. ActionList CompileJobOffloadActions; auto *CJ = getPrevDependentAction(BJ->getInputs(), CompileJobOffloadActions, @@ -3378,6 +3413,16 @@ if (!BJ || !CJ) return nullptr; + // Cannot combine compilation with backend for amdgcn backend. However + // it is necessary to combine when generating IR for compile-only with + // flags "-c -S -emit-llvm". If only flags "-c -S" the gcn backend is + // needed to generate linked and opt IR for llc, so do not combine. + if (isAMDGPUCUDAOffloading(BJ, TC.getTriple()) && + !(C.getArgs().hasArg(options::OPT_c) && + C.getArgs().hasArg(options::OPT_S) && + C.getArgs().hasArg(options::OPT_emit_llvm))) + return nullptr; + // Get compiler tool. 
const Tool *T = TC.SelectTool(*CJ); if (!T) @@ -3421,6 +3466,14 @@ EmbedBitcode(EmbedBitcode) { assert(BaseAction && "Invalid base action."); IsHostSelector = BaseAction->getOffloadingDeviceKind() == Action::OFK_None; + // Store the InputType to check if Compile and Backend can collapse + for (Arg *A : C.getInputArgs()) { + if (A->getOption().getKind() == Option::InputClass) { + const char *Value = A->getValue(); + if (const char *Ext = strrchr(Value, '.')) + InputType = TC.LookupTypeForExtension(Ext + 1); + } + } } /// Check if a chain of actions can be combined and return the tool that can @@ -3849,8 +3902,13 @@ CCGenDiagnostics) { StringRef Name = llvm::sys::path::filename(BaseInput); std::pair Split = Name.split('.'); + SmallString<128> fname(Split.first.str().c_str()); + if (!BoundArch.empty()) { + fname += "-"; + fname.append(BoundArch); + } std::string TmpName = GetTemporaryPath( - Split.first, types::getTypeTempSuffix(JA.getType(), IsCLMode())); + fname, types::getTypeTempSuffix(JA.getType(), IsCLMode())); return C.addTempFile(C.getArgs().MakeArgString(TmpName)); } @@ -3921,7 +3979,10 @@ JA.getType() == types::TY_LLVM_BC) Suffixed += ".tmp"; Suffixed += '.'; - Suffixed += Suffix; + if (((StringRef)BaseInput).endswith(".a")) + Suffixed += "a"; + else + Suffixed += Suffix; NamedOutput = C.getArgs().MakeArgString(Suffixed.c_str()); } Index: lib/Driver/SanitizerArgs.cpp =================================================================== --- lib/Driver/SanitizerArgs.cpp +++ lib/Driver/SanitizerArgs.cpp @@ -729,7 +729,8 @@ // NVPTX doesn't currently support sanitizers. Bailing out here means that // e.g. -fsanitize=address applies only to host code, which is what we want // for now. - if (TC.getTriple().isNVPTX()) + if (TC.getTriple().isNVPTX() || + TC.getTriple().getArch() == llvm::Triple::amdgcn) return; // Translate available CoverageFeatures to corresponding clang-cc1 flags. 
Index: lib/Driver/ToolChain.cpp =================================================================== --- lib/Driver/ToolChain.cpp +++ lib/Driver/ToolChain.cpp @@ -253,6 +253,10 @@ return Clang.get(); } +Tool *ToolChain::buildDeviceLibraryLinker() const { + return new tools::Clang(*this); +} + Tool *ToolChain::buildAssembler() const { return new tools::ClangAs(*this); } @@ -267,6 +271,12 @@ return Assemble.get(); } +Tool *ToolChain::getDeviceLibraryLink() const { + if (!DeviceLibraryLink) + DeviceLibraryLink.reset(buildDeviceLibraryLinker()); + return DeviceLibraryLink.get(); +} + Tool *ToolChain::getClangAs() const { if (!Assemble) Assemble.reset(new tools::ClangAs(*this)); @@ -307,8 +317,9 @@ case Action::AnalyzeJobClass: case Action::MigrateJobClass: case Action::VerifyPCHJobClass: - case Action::BackendJobClass: return getClang(); + case Action::BackendJobClass: + return getDeviceLibraryLink(); case Action::OffloadBundlingJobClass: case Action::OffloadUnbundlingJobClass: @@ -406,8 +417,21 @@ } Tool *ToolChain::SelectTool(const JobAction &JA) const { - if (getDriver().ShouldUseClangCompiler(JA)) return getClang(); Action::ActionClass AC = JA.getKind(); + // The amdgcn backend needs the tool from buildDeviceLibraryLinker() + // if ( StringRef(JA.getOffloadingArch()).startswith("gfx") && + if (JA.isOffloading(Action::OFK_Cuda) && + (StringRef(JA.getOffloadingArch()).startswith("gfx") || + (getTriple().getArch() == llvm::Triple::amdgcn)) && + (AC == Action::BackendJobClass)) { + if ((Args.hasArg(options::OPT_emit_llvm)) || + (Args.hasArg(options::OPT_emit_llvm_bc))) + return getClang(); // Don't run amdgcn backend if we just want LLVM IR + else + return getTool(AC); + }; + if (getDriver().ShouldUseClangCompiler(JA)) + return getClang(); if (AC == Action::AssembleJobClass && useIntegratedAs()) return getClangAs(); return getTool(AC); Index: lib/Driver/ToolChains/Clang.cpp =================================================================== --- lib/Driver/ToolChains/Clang.cpp +++ 
lib/Driver/ToolChains/Clang.cpp @@ -2324,9 +2324,10 @@ ArgStringList &CmdArgs, bool KernelOrKext) { const llvm::Triple &EffectiveTriple = TC.getEffectiveTriple(); - // NVPTX doesn't support stack protectors; from the compiler's perspective, it - // doesn't even have a stack! - if (EffectiveTriple.isNVPTX()) + // NVPTX and GCN don't support stack protectors; from the compiler's + // perspective, they don't even have a stack! + if (EffectiveTriple.isNVPTX() || + EffectiveTriple.getArch() == llvm::Triple::amdgcn) return; // -stack-protector=0 is default. @@ -3496,7 +3497,8 @@ // Enable -mconstructor-aliases except on darwin, where we have to work around // a linker bug (see ), and CUDA device code, where // aliases aren't supported. - if (!RawTriple.isOSDarwin() && !RawTriple.isNVPTX()) + if (!RawTriple.isOSDarwin() && !RawTriple.isNVPTX() && + RawTriple.getArch() != llvm::Triple::amdgcn) CmdArgs.push_back("-mconstructor-aliases"); // Darwin's kernel doesn't support guard variables; just die if we Index: lib/Driver/ToolChains/Cuda.h =================================================================== --- lib/Driver/ToolChains/Cuda.h +++ lib/Driver/ToolChains/Cuda.h @@ -127,6 +127,53 @@ }; } // end namespace NVPTX + +namespace AMDGCN { +// Runs llc (AMDGPU codegen) and then lld to produce an HSA code object. +class LLVM_LIBRARY_VISIBILITY Assembler : public Tool { +public: + Assembler(const ToolChain &TC) + : Tool("AMDGCN::Assembler", "llc", TC, RF_Full, llvm::sys::WEM_UTF8, + "--options-file") {} + + bool hasIntegratedCPP() const override { return false; } + + void ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const override; +}; + +// Runs clang-offload-bundler, which combines AMDGCN object files into a single +// output file. 
+class LLVM_LIBRARY_VISIBILITY Linker : public Tool { +public: + Linker(const ToolChain &TC) + : Tool("AMDGCN::Linker", "clang-offload-bundler", TC, RF_Full, + llvm::sys::WEM_UTF8, "--options-file") {} + + bool hasIntegratedCPP() const override { return false; } + + void ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const override; +}; + +// For amdgcn the device library linker is llvm-link + opt. +class LLVM_LIBRARY_VISIBILITY DeviceLibraryLinker : public Tool { +public: + DeviceLibraryLinker(const ToolChain &TC) + : Tool("AMDGCN::DeviceLibraryLinker", "device-library-linker", TC, + RF_Full, llvm::sys::WEM_UTF8, "--options-file") {} + virtual bool hasIntegratedCPP() const override { return false; } + virtual void ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const override; +}; +} // end namespace AMDGCN } // end namespace tools namespace toolchains { @@ -184,6 +231,7 @@ CudaInstallationDetector CudaInstallation; protected: + Tool *buildDeviceLibraryLinker() const override; // for amdgcn, link and opt Tool *buildAssembler() const override; // ptxas Tool *buildLinker() const override; // fatbinary (ok, not really a linker) Index: lib/Driver/ToolChains/Cuda.cpp =================================================================== --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -222,10 +222,48 @@ IsValid = true; break; } + + ArgStringList LibraryPaths; + for (auto Arg : Args) { + if (Arg->getSpelling() == "-L") { + llvm::Twine DevicePath = Twine(Arg->getValue()).concat("/libdevice/"); + if (D.getVFS().exists(DevicePath)) { + LibraryPaths.push_back(Args.MakeArgString(DevicePath)); + } else { + if (D.getVFS().exists(Arg->getValue())) + LibraryPaths.push_back(Arg->getValue()); + } + } + } + 
LibraryPaths.push_back(Args.MakeArgString(D.Dir + "/../lib/libdevice/")); + + // Search for GCN Device Libraries + for (Arg *A : Args) { + if (A->getOption().matches(options::OPT_cuda_gpu_arch_EQ) && + StringRef(A->getValue()).startswith("gfx")) { + StringRef GFXNAME = A->getValue(); + for (auto LP : LibraryPaths) { + StringRef GCNPath = Args.MakeArgString(LP + GFXNAME); + if (D.getVFS().exists(GCNPath)) { + LibDeviceMap[GFXNAME.str()] = GCNPath; + break; + } + } + } + } } void CudaInstallationDetector::AddCudaIncludeArgs( const ArgList &DriverArgs, ArgStringList &CC1Args) const { + + if (DriverArgs.hasArg(options::OPT_x) && + (StringRef(DriverArgs.getLastArg(options::OPT_x)->getValue()) == + "hip")) { + // HIP needs c++11. + CC1Args.push_back("-std=c++11"); + // Skip CUDA includes for HIP. + return; + } if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { // Add cuda_wrappers/* to our system include path. This lets us wrap // standard library headers. @@ -273,6 +311,201 @@ << CudaVersionToString(Version) << "\n"; } +static bool addBCLib(Compilation &C, const ArgList &Args, + ArgStringList &CmdArgs, ArgStringList LibraryPaths, + const char *BCName) { + std::string FullName; + bool FoundLibDevice = false; + for (std::string LibraryPath : LibraryPaths) { + FullName = Args.MakeArgString(LibraryPath + "/" + BCName); + if (llvm::sys::fs::exists(FullName.c_str())) { + FoundLibDevice = true; + break; + } + } + if (!FoundLibDevice) + C.getDriver().Diag(diag::err_drv_no_such_file) << BCName; + CmdArgs.push_back(Args.MakeArgString(FullName)); + return FoundLibDevice; +} + +void AMDGCN::Assembler::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + const auto &TC = + static_cast(getToolChain()); + assert(TC.getTriple().getArch() == llvm::Triple::amdgcn && "Wrong platform"); + + ArgStringList CmdArgs; + for (InputInfoList::const_iterator it = Inputs.begin(), ie 
= Inputs.end(); + it != ie; ++it) { + const InputInfo &II = *it; + CmdArgs.push_back(II.getFilename()); + } + CmdArgs.push_back("-mtriple=amdgcn-amd-amdhsa"); + CmdArgs.push_back("-filetype=obj"); + std::string GFXNAME = JA.getOffloadingArch(); + CmdArgs.push_back(Args.MakeArgString("-mcpu=" + GFXNAME)); + CmdArgs.push_back("-o"); + std::string TmpName = C.getDriver().GetTemporaryPath("LC_OUTPUT", "o"); + const char *llcOutputFile = + C.addTempFile(C.getArgs().MakeArgString(TmpName.c_str())); + CmdArgs.push_back(llcOutputFile); + const char *Exec = Args.MakeArgString(C.getDriver().Dir + "/llc"); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + + ArgStringList CmdArgs2; + CmdArgs2.push_back("-flavor"); + CmdArgs2.push_back("gnu"); + CmdArgs2.push_back("--no-undefined"); + CmdArgs2.push_back("-shared"); + // The output from ld.lld is an HSA code object file + CmdArgs2.push_back("-o"); + CmdArgs2.push_back(Output.getFilename()); + CmdArgs2.push_back(llcOutputFile); + const char *lld = Args.MakeArgString(C.getDriver().Dir + "/lld"); + C.addCommand(llvm::make_unique(JA, *this, lld, CmdArgs2, Inputs)); + return; +} + +// All inputs to this linker must be from CudaDeviceActions, as we need to look +// at the Inputs' Actions in order to figure out which GPU architecture they +// correspond to. +void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + const auto &TC = + static_cast(getToolChain()); + assert(TC.getTriple().getArch() == llvm::Triple::amdgcn && "Wrong platform"); + + ArgStringList CmdArgs; + CmdArgs.push_back(Args.MakeArgString("-type=o")); + + // ToDo: Remove the dummy host binary entry which is required by + // clang-offload-bundler. 
+ std::string targets = "-targets=host-x86_64-uknown-linux"; + std::string inputs = "-inputs=/dev/null"; + for (const auto &II : Inputs) { + if (II.getType() != types::TY_PP_Asm) { + // ToDo: Teach clang-offload-bundler to recognize hip. + targets = targets + ",openmp-amdgcn--amdhsa-" + + StringRef(II.getAction()->getOffloadingArch()).str(); + inputs = inputs + "," + II.getFilename(); + } + } + CmdArgs.push_back(Args.MakeArgString(targets)); + CmdArgs.push_back(Args.MakeArgString(inputs)); + + auto outputArgString = + Args.MakeArgString(std::string("-outputs=").append(Output.getFilename())); + CmdArgs.push_back(outputArgString); + + const char *Exec = + Args.MakeArgString(C.getDriver().Dir + "/clang-offload-bundler"); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + return; +} + +void AMDGCN::DeviceLibraryLinker::ConstructJob( + Compilation &C, const JobAction &JA, const InputInfo &Output, + const InputInfoList &Inputs, const ArgList &Args, + const char *LinkingOutput) const { + + assert(StringRef(JA.getOffloadingArch()).startswith("gfx") && + " unless gfx processor, backend should be clang"); + + // For amdgcn the Backend Job will call llvm-link & opt steps + ArgStringList CmdArgs; + // Add the input bc's created by compile step + for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); + it != ie; ++it) { + const InputInfo &II = *it; + CmdArgs.push_back(II.getFilename()); + } + + std::string GFXNAME = JA.getOffloadingArch(); + + ArgStringList LibraryPaths; + + // Find in -L and LIBRARY_PATH. + for (auto Arg : Args) { + if (Arg->getSpelling() == "-L") { + LibraryPaths.push_back(Args.MakeArgString( + std::string(Arg->getValue()) + "/libdevice/" + std::string(GFXNAME))); + LibraryPaths.push_back(Args.MakeArgString(Arg->getValue())); + } + } + + // add the compiler installation libdevice last so -L will override them. 
+ LibraryPaths.push_back(Args.MakeArgString( + C.getDriver().Dir + "/../lib/libdevice/" + std::string(GFXNAME))); + + addDirectoryList(Args, LibraryPaths, "-L", "LIBRARY_PATH"); + + addBCLib(C, Args, CmdArgs, LibraryPaths, "libhiprt.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "opencl.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "ockl.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "irif.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "ocml.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_finite_only_off.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_daz_opt_off.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, + "oclc_correctly_rounded_sqrt_on.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_unsafe_math_off.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "hc.amdgcn.bc"); + addBCLib(C, Args, CmdArgs, LibraryPaths, "oclc_isa_version.amdgcn.bc"); + + CmdArgs.push_back("-suppress-warnings"); + + // Add an intermediate output file which is input to opt + CmdArgs.push_back("-o"); + std::string TmpName = C.getDriver().GetTemporaryPath("OPT_INPUT", "bc"); + const char *ResultingBitcodeF = + C.addTempFile(C.getArgs().MakeArgString(TmpName.c_str())); + CmdArgs.push_back(ResultingBitcodeF); + const char *Exec = Args.MakeArgString(C.getDriver().Dir + "/llvm-link"); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); + + ArgStringList OptArgs; + // The input to opt is the output from llvm-link. + OptArgs.push_back(ResultingBitcodeF); + // Pass optimization arg to opt. 
+ if (Arg *A = Args.getLastArg(options::OPT_O_Group)) { + StringRef OOpt = "3"; + if (A->getOption().matches(options::OPT_O4) || + A->getOption().matches(options::OPT_Ofast)) + OOpt = "3"; + else if (A->getOption().matches(options::OPT_O0)) + OOpt = "0"; + else if (A->getOption().matches(options::OPT_O)) { + // -Os, -Oz, and -O(anything else) map to -O2 + OOpt = llvm::StringSwitch(A->getValue()) + .Case("1", "1") + .Case("2", "2") + .Case("3", "3") + .Case("s", "2") + .Case("z", "2") + .Default("2"); + } + OptArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt)); + + OptArgs.push_back("-S"); + const char *mcpustr = Args.MakeArgString("-mcpu=" + GFXNAME); + OptArgs.push_back(mcpustr); + OptArgs.push_back("-dce"); + OptArgs.push_back("-sroa"); + OptArgs.push_back("-globaldce"); + } + OptArgs.push_back("-o"); + OptArgs.push_back(Output.getFilename()); + const char *OptExec = Args.MakeArgString(C.getDriver().Dir + "/opt"); + C.addCommand(llvm::make_unique(JA, *this, OptExec, OptArgs, Inputs)); +} + void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, @@ -563,10 +796,17 @@ DriverArgs.hasArg(options::OPT_S)) return; - getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch; + if (GpuArch.startswith("gfx")) + getDriver().Diag(diag::err_drv_no_hip_libdevice) << GpuArch; + else + getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch; return; } + // Do not add -mlink-cuda-bitcode or ptx42 features if gfx + if (GpuArch.startswith("gfx")) + return; + CC1Args.push_back("-mlink-cuda-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile)); @@ -709,13 +949,21 @@ return DAL; } +Tool *CudaToolChain::buildDeviceLibraryLinker() const { + return new tools::AMDGCN::DeviceLibraryLinker(*this); +} + Tool *CudaToolChain::buildAssembler() const { + if (getTriple().getArch() == llvm::Triple::amdgcn) + return new tools::AMDGCN::Assembler(*this); return new tools::NVPTX::Assembler(*this); 
} Tool *CudaToolChain::buildLinker() const { if (OK == Action::OFK_OpenMP) return new tools::NVPTX::OpenMPLinker(*this); + if (getTriple().getArch() == llvm::Triple::amdgcn) + return new tools::AMDGCN::Linker(*this); return new tools::NVPTX::Linker(*this); } Index: lib/Driver/ToolChains/Gnu.cpp =================================================================== --- lib/Driver/ToolChains/Gnu.cpp +++ lib/Driver/ToolChains/Gnu.cpp @@ -471,6 +471,12 @@ // The profile runtime also needs access to system libraries. getToolChain().addProfileRTLibs(Args, CmdArgs); + // The hip runtime is installed in lib dir of compiler installation. + if (Args.hasArg(options::OPT_x) && + StringRef(Args.getLastArg(options::OPT_x)->getValue()) == "hip") { + CmdArgs.push_back("-lhip_hcc"); + } + if (D.CCCIsCXX() && !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { if (ToolChain.ShouldLinkCXXStdlib(Args)) { Index: lib/Driver/Types.cpp =================================================================== --- lib/Driver/Types.cpp +++ lib/Driver/Types.cpp @@ -102,6 +102,9 @@ case TY_CL: case TY_CUDA: case TY_PP_CUDA: case TY_CUDA_DEVICE: + case TY_HIP: + case TY_PP_HIP: + case TY_HIP_DEVICE: case TY_ObjC: case TY_PP_ObjC: case TY_PP_ObjC_Alias: case TY_CXX: case TY_PP_CXX: case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias: @@ -141,6 +144,9 @@ case TY_ObjCXXHeader: case TY_PP_ObjCXXHeader: case TY_CXXModule: case TY_PP_CXXModule: case TY_CUDA: case TY_PP_CUDA: case TY_CUDA_DEVICE: + case TY_HIP: + case TY_PP_HIP: + case TY_HIP_DEVICE: return true; } } @@ -166,6 +172,9 @@ case TY_CUDA: case TY_PP_CUDA: case TY_CUDA_DEVICE: + case TY_HIP: + case TY_PP_HIP: + case TY_HIP_DEVICE: return true; } } Index: lib/Frontend/CompilerInstance.cpp =================================================================== --- lib/Frontend/CompilerInstance.cpp +++ lib/Frontend/CompilerInstance.cpp @@ -410,7 +410,8 @@ // triple (the host triple) to initialize our header search, since we need to // 
find the host headers in order to compile the CUDA code. const llvm::Triple *HeaderSearchTriple = &PP->getTargetInfo().getTriple(); - if (PP->getTargetInfo().getTriple().getOS() == llvm::Triple::CUDA && + if ((PP->getTargetInfo().getTriple().getOS() == llvm::Triple::CUDA || + PP->getTargetInfo().getTriple().getOS() == llvm::Triple::AMDHSA) && PP->getAuxTargetInfo()) HeaderSearchTriple = &PP->getAuxTargetInfo()->getTriple(); Index: test/Driver/cuda-phases.cu =================================================================== --- test/Driver/cuda-phases.cu +++ test/Driver/cuda-phases.cu @@ -7,24 +7,29 @@ // REQUIRES: clang-driver // REQUIRES: powerpc-registered-target // REQUIRES: nvptx-registered-target - +// REQUIRES: amdgpu-registered-target // // Test single gpu architecture with complete compilation. // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \ -// RUN: | FileCheck -check-prefix=BIN %s -// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s +// BIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-cuda) +// BIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-cuda) +// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda) // BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30) -// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30) -// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30) -// BIN-DAG: 
[[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30) -// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object -// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler +// BIN_NV-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH:sm_30]]) +// BIN_AMD-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH:gfx803]]) +// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]]) +// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, [[ARCH]]) +// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, [[ARCH]]) +// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, [[ARCH]]) +// BIN_NV-DAG: [[P8:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH]])" {[[P7]]}, object +// BIN_AMD-DAG: [[P8:[0-9]+]]: offload, "device-cuda ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P7]]}, object +// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda ([[TRIPLE]]:[[ARCH]])" {[[P6]]}, assembler // BIN-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-cuda) -// BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P10]]}, ir +// BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda ([[TRIPLE]])" {[[P10]]}, ir // BIN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-cuda) // BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-cuda) // BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-cuda) @@ -34,13 +39,15 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \ // RUN: | FileCheck -check-prefix=ASM %s -// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, 
(device-cuda, sm_30) -// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (host-cuda) +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefix=ASM %s +// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH:sm_30|gfx803]]) +// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]]) +// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler +// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-cuda) +// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (host-cuda) // ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-cuda) // ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-cuda) @@ -49,25 +56,27 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ // RUN: | FileCheck -check-prefix=BIN2 %s -// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \ +// RUN: | FileCheck -check-prefix=BIN2 %s +// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (host-cuda) +// BIN2-DAG: 
[[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda) // BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) -// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30) -// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30) -// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30) -// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30) -// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object -// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler -// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, sm_35) -// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, sm_35) -// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, sm_35) -// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, sm_35) -// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P14]]}, object -// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P13]]}, assembler +// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH1:sm_30|gfx803]]) +// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, [[ARCH1]]) +// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P7]]}, object +// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda 
([[TRIPLE]]:[[ARCH1]])" {[[P6]]}, assembler +// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH2:sm_35|gfx900]]) +// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, [[ARCH2]]) +// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda ([[TRIPLE]]:[[ARCH2]])" {[[P14]]}, object +// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda ([[TRIPLE]]:[[ARCH2]])" {[[P13]]}, assembler // BIN2-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-cuda) -// BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P17]]}, ir +// BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda ([[TRIPLE]])" {[[P17]]}, ir // BIN2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-cuda) // BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-cuda) // BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-cuda) @@ -77,18 +86,20 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \ // RUN: | FileCheck -check-prefix=ASM2 %s -// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// ASM2-DAG: 
[[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35) -// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35) -// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35) -// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler -// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (host-cuda) +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefix=ASM2 %s +// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH1:sm_30|gfx803]]) +// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH1]]) +// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH1]]) +// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH1]]) +// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P3]]}, assembler +// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH2:sm_35|gfx900]]) +// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-cuda, [[ARCH2]]) +// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, [[ARCH2]]) +// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, [[ARCH2]]) +// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler +// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-cuda) +// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (host-cuda) // ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-cuda) // ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-cuda) @@ -98,8 +109,10 @@ // // RUN: %clang -target 
powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \ // RUN: | FileCheck -check-prefix=HBIN %s -// HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefix=HBIN %s +// HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (host-cuda) +// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda) // HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) // HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) // HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda) @@ -110,8 +123,10 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=HASM %s -// HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-host-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=HASM %s +// HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (host-cuda) +// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda) // HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) // HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) @@ -121,8 +136,10 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \ // RUN: | FileCheck -check-prefix=HBIN2 %s -// HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, 
cuda-cpp-output, (host-cuda) +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefix=HBIN2 %s +// HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (host-cuda) +// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda) // HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) // HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) // HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda) @@ -134,8 +151,10 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=HASM2 %s -// HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=HASM2 %s +// HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (host-cuda) +// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-cuda) // HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) // HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) @@ -145,12 +164,14 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \ // RUN: | FileCheck -check-prefix=DBIN %s -// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// 
DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30) -// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-device-only 2>&1 \ +// RUN: | FileCheck -check-prefix=DBIN %s +// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH:sm_30|gfx803]]) +// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, [[ARCH]]) +// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P4]]}, object // // Test single gpu architecture up to the assemble phase in device-only @@ -158,11 +179,13 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=DASM %s -// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 %s --cuda-device-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=DASM %s +// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH:sm_30|gfx803]]) +// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]]) 
+// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler // // Test two gpu architectures with complete compilation in device-only @@ -170,18 +193,20 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \ // RUN: | FileCheck -check-prefix=DBIN2 %s -// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30) -// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object -// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, sm_35) -// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, sm_35) -// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, sm_35) -// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, sm_35) -// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P10]]}, object +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only 2>&1 \ +// RUN: | FileCheck -check-prefix=DBIN2 %s +// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH:sm_30|gfx803]]) +// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, 
[[ARCH]]) +// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, [[ARCH]]) +// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P4]]}, object +// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH2:sm_35|gfx900]]) +// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, [[T]]-cpp-output, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, [[ARCH2]]) +// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda ([[TRIPLE]]:[[ARCH2]])" {[[P10]]}, object // // Test two gpu architectures up to the assemble phase in device-only @@ -189,13 +214,15 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=DASM2 %s -// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) -// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) -// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) -// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler -// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35) -// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35) -// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35) -// 
DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler +// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=DASM2 %s +// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda|hip]], (device-cuda, [[ARCH:sm_30|gfx803]]) +// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-cuda, [[ARCH]]) +// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, [[ARCH]]) +// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, [[ARCH]]) +// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler +// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-cuda, [[ARCH2:sm_35|gfx900]]) +// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-cuda, [[ARCH2]]) +// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, [[ARCH2]]) +// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, [[ARCH2]]) +// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler