diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -6039,6 +6039,9 @@
     case llvm::Triple::Solaris:
       TC = std::make_unique<toolchains::Solaris>(*this, Target, Args);
       break;
+    case llvm::Triple::CUDA:
+      TC = std::make_unique<toolchains::NVPTXToolChain>(*this, Target, Args);
+      break;
     case llvm::Triple::AMDHSA:
       TC = std::make_unique<toolchains::ROCMToolChain>(*this, Target, Args);
       break;
@@ -6158,11 +6161,6 @@
     }
   }

-  // Intentionally omitted from the switch above: llvm::Triple::CUDA. CUDA
-  // compiles always need two toolchains, the CUDA toolchain and the host
-  // toolchain. So the only valid way to create a CUDA toolchain is via
-  // CreateOffloadingDeviceToolChains.
-
   return *TC;
 }
diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h
--- a/clang/lib/Driver/ToolChains/Cuda.h
+++ b/clang/lib/Driver/ToolChains/Cuda.h
@@ -95,6 +95,19 @@
 // Runs fatbinary, which combines GPU object files ("cubin" files) and/or PTX
 // assembly into a single output file.
+class LLVM_LIBRARY_VISIBILITY FatBinary : public Tool {
+public:
+  FatBinary(const ToolChain &TC) : Tool("NVPTX::Linker", "fatbinary", TC) {}
+
+  bool hasIntegratedCPP() const override { return false; }
+
+  void ConstructJob(Compilation &C, const JobAction &JA,
+                    const InputInfo &Output, const InputInfoList &Inputs,
+                    const llvm::opt::ArgList &TCArgs,
+                    const char *LinkingOutput) const override;
+};
+
+// Runs nvlink, which links GPU object files ("cubin" files) into a single file.
 class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
 public:
   Linker(const ToolChain &TC) : Tool("NVPTX::Linker", "fatbinary", TC) {}
@@ -116,7 +129,48 @@
 namespace toolchains {

-class LLVM_LIBRARY_VISIBILITY CudaToolChain : public ToolChain {
+class LLVM_LIBRARY_VISIBILITY NVPTXToolChain : public ToolChain {
+public:
+  NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
+                 const llvm::Triple &HostTriple,
+                 const llvm::opt::ArgList &Args);
+
+  NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
+                 const llvm::opt::ArgList &Args);
+
+  llvm::opt::DerivedArgList *
+  TranslateArgs(const llvm::opt::DerivedArgList &Args, StringRef BoundArch,
+                Action::OffloadKind DeviceOffloadKind) const override;
+
+  // Never try to use the integrated assembler with CUDA; always fork out to
+  // ptxas.
+  bool useIntegratedAs() const override { return false; }
+  bool isCrossCompiling() const override { return true; }
+  bool isPICDefault() const override { return false; }
+  bool isPIEDefault(const llvm::opt::ArgList &Args) const override {
+    return false;
+  }
+  bool isPICDefaultForced() const override { return false; }
+  bool SupportsProfiling() const override { return false; }
+
+  bool IsMathErrnoDefault() const override { return false; }
+
+  bool supportsDebugInfoOption(const llvm::opt::Arg *A) const override;
+  void adjustDebugInfoKind(codegenoptions::DebugInfoKind &DebugInfoKind,
+                           const llvm::opt::ArgList &Args) const override;
+
+  // NVPTX supports only DWARF2.
+  unsigned GetDefaultDwarfVersion() const override { return 2; }
+  unsigned getMaxDwarfVersion() const override { return 2; }
+
+  CudaInstallationDetector CudaInstallation;
+
+protected:
+  Tool *buildAssembler() const override; // ptxas.
+  Tool *buildLinker() const override;    // nvlink.
+};
+
+class LLVM_LIBRARY_VISIBILITY CudaToolChain : public NVPTXToolChain {
 public:
   CudaToolChain(const Driver &D, const llvm::Triple &Triple,
                 const ToolChain &HostTC, const llvm::opt::ArgList &Args);
@@ -139,21 +193,6 @@
       const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
       const llvm::fltSemantics *FPType = nullptr) const override;

-  // Never try to use the integrated assembler with CUDA; always fork out to
-  // ptxas.
-  bool useIntegratedAs() const override { return false; }
-  bool isCrossCompiling() const override { return true; }
-  bool isPICDefault() const override { return false; }
-  bool isPIEDefault(const llvm::opt::ArgList &Args) const override {
-    return false;
-  }
-  bool isPICDefaultForced() const override { return false; }
-  bool SupportsProfiling() const override { return false; }
-  bool supportsDebugInfoOption(const llvm::opt::Arg *A) const override;
-  void adjustDebugInfoKind(codegenoptions::DebugInfoKind &DebugInfoKind,
-                           const llvm::opt::ArgList &Args) const override;
-  bool IsMathErrnoDefault() const override { return false; }
-
   void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                           llvm::opt::ArgStringList &CC1Args) const override;
@@ -174,12 +213,7 @@
   computeMSVCVersion(const Driver *D,
                      const llvm::opt::ArgList &Args) const override;

-  unsigned GetDefaultDwarfVersion() const override { return 2; }
-  // NVPTX supports only DWARF2.
-  unsigned getMaxDwarfVersion() const override { return 2; }
-
   const ToolChain &HostTC;
-  CudaInstallationDetector CudaInstallation;

   /// Uses nvptx-arch tool to get arch of the system GPU. Will return error
   /// if unable to find one.
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -369,18 +369,20 @@
                                     const ArgList &Args,
                                     const char *LinkingOutput) const {
   const auto &TC =
-      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+      static_cast<const toolchains::NVPTXToolChain &>(getToolChain());
   assert(TC.getTriple().isNVPTX() && "Wrong platform");

   StringRef GPUArchName;
-  // If this is an OpenMP action we need to extract the device architecture
-  // from the -march=arch option. This option may come from -Xopenmp-target
-  // flag or the default value.
-  if (JA.isDeviceOffloading(Action::OFK_OpenMP)) {
+  // If this is a CUDA action we need to extract the device architecture
+  // from the Job's associated architecture, otherwise use the -march=arch
+  // option. This option may come from -Xopenmp-target flag or the default
+  // value.
+  if (JA.isDeviceOffloading(Action::OFK_Cuda)) {
+    GPUArchName = JA.getOffloadingArch();
+  } else {
     GPUArchName = Args.getLastArgValue(options::OPT_march_EQ);
     assert(!GPUArchName.empty() && "Must have an architecture passed in.");
-  } else
-    GPUArchName = JA.getOffloadingArch();
+  }

   // Obtain architecture from the action.
   CudaArch gpu_arch = StringToCudaArch(GPUArchName);
@@ -442,8 +444,22 @@
   CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
   CmdArgs.push_back("--output-file");
   const char *OutputFileName = Args.MakeArgString(TC.getInputFilename(Output));
-  if (std::string(OutputFileName) != std::string(Output.getFilename()))
+
+  // If we are invoking `nvlink` internally we need to output a `.cubin` file.
+  // Checking if the output is a temporary is the cleanest way to determine
+  // this. Putting this logic in `getInputFilename` isn't an option because it
+  // relies on the compilation.
+  // FIXME: This should hopefully be removed if NVIDIA updates their tooling.
+  if (Output.isFilename() &&
+      llvm::find(C.getTempFiles(), Output.getFilename()) !=
+          C.getTempFiles().end()) {
+    SmallString<256> Filename(Output.getFilename());
+    llvm::sys::path::replace_extension(Filename, "cubin");
+    OutputFileName = Args.MakeArgString(Filename);
+  }
+  if (Output.isFilename() && OutputFileName != Output.getFilename())
     C.addTempFile(OutputFileName);
+
   CmdArgs.push_back(OutputFileName);

   for (const auto &II : Inputs)
     CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
@@ -451,15 +467,19 @@
   for (const auto &A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
     CmdArgs.push_back(Args.MakeArgString(A));

-  bool Relocatable = false;
+  bool Relocatable;
   if (JA.isOffloading(Action::OFK_OpenMP))
     // In OpenMP we need to generate relocatable code.
     Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target,
                                options::OPT_fnoopenmp_relocatable_target,
                                /*Default=*/true);
   else if (JA.isOffloading(Action::OFK_Cuda))
+    // In CUDA we generate relocatable code by default.
     Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
                                /*Default=*/false);
+  else
+    // Otherwise, we are compiling directly and should create linkable output.
+    Relocatable = true;

   if (Relocatable)
     CmdArgs.push_back("-c");
@@ -495,11 +515,11 @@
 // All inputs to this linker must be from CudaDeviceActions, as we need to look
 // at the Inputs' Actions in order to figure out which GPU architecture they
 // correspond to.
-void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
-                                 const InputInfo &Output,
-                                 const InputInfoList &Inputs,
-                                 const ArgList &Args,
-                                 const char *LinkingOutput) const {
+void NVPTX::FatBinary::ConstructJob(Compilation &C, const JobAction &JA,
+                                    const InputInfo &Output,
+                                    const InputInfoList &Inputs,
+                                    const ArgList &Args,
+                                    const char *LinkingOutput) const {
   const auto &TC =
       static_cast<const toolchains::CudaToolChain &>(getToolChain());
   assert(TC.getTriple().isNVPTX() && "Wrong platform");
@@ -546,6 +566,93 @@
                          Exec, CmdArgs, Inputs, Output));
 }

+void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
+                                 const InputInfo &Output,
+                                 const InputInfoList &Inputs,
+                                 const ArgList &Args,
+                                 const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::NVPTXToolChain &>(getToolChain());
+  assert(TC.getTriple().isNVPTX() && "Wrong platform");
+
+  ArgStringList CmdArgs;
+  if (Output.isFilename()) {
+    CmdArgs.push_back("-o");
+    CmdArgs.push_back(Output.getFilename());
+  } else {
+    assert(Output.isNothing() && "Invalid output.");
+  }
+
+  if (mustEmitDebugInfo(Args) == EmitSameDebugInfoAsHost)
+    CmdArgs.push_back("-g");
+
+  if (Args.hasArg(options::OPT_v))
+    CmdArgs.push_back("-v");
+
+  StringRef GPUArch = Args.getLastArgValue(options::OPT_march_EQ);
+  assert(!GPUArch.empty() && "At least one GPU Arch required for nvlink.");
+
+  CmdArgs.push_back("-arch");
+  CmdArgs.push_back(Args.MakeArgString(GPUArch));
+
+  // Add paths specified in LIBRARY_PATH environment variable as -L options.
+  addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH");
+
+  // Add paths for the default clang library path.
+  SmallString<256> DefaultLibPath =
+      llvm::sys::path::parent_path(TC.getDriver().Dir);
+  llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME);
+  CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath));
+
+  for (const auto &II : Inputs) {
+    if (II.getType() == types::TY_LLVM_IR || II.getType() == types::TY_LTO_IR ||
+        II.getType() == types::TY_LTO_BC || II.getType() == types::TY_LLVM_BC) {
+      C.getDriver().Diag(diag::err_drv_no_linker_llvm_support)
+          << getToolChain().getTripleString();
+      continue;
+    }
+
+    // Currently, we only pass the input files to the linker, we do not pass
+    // any libraries that may be valid only for the host.
+    if (!II.isFilename())
+      continue;
+
+    // The 'nvlink' application performs RDC-mode linking when given a '.o'
+    // file and device linking when given a '.cubin' file. We always want to
+    // perform device linking, so just rename any '.o' files.
+    // FIXME: This should hopefully be removed if NVIDIA updates their tooling.
+    auto InputFile = getToolChain().getInputFilename(II);
+    if (llvm::sys::path::extension(InputFile) != ".cubin") {
+      // If there are no actions above this one then this is direct input and we
+      // can copy it. Otherwise the input is internal so a `.cubin` file should
+      // exist.
+      if (II.getAction() && II.getAction()->getInputs().size() == 0) {
+        const char *CubinF =
+            Args.MakeArgString(getToolChain().getDriver().GetTemporaryPath(
+                llvm::sys::path::stem(InputFile), "cubin"));
+        if (std::error_code EC =
+                llvm::sys::fs::copy_file(InputFile, C.addTempFile(CubinF)))
+          continue;
+
+        CmdArgs.push_back(CubinF);
+      } else {
+        SmallString<256> Filename(InputFile);
+        llvm::sys::path::replace_extension(Filename, "cubin");
+        CmdArgs.push_back(Args.MakeArgString(Filename));
+      }
+    } else {
+      CmdArgs.push_back(Args.MakeArgString(InputFile));
+    }
+  }
+
+  C.addCommand(std::make_unique<Command>(
+      JA, *this,
+      ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
+                          "--options-file"},
+      Args.MakeArgString(getToolChain().GetProgramPath("nvlink")), CmdArgs,
+      Inputs, Output));
+}
+
 void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
                                    const llvm::opt::ArgList &Args,
                                    std::vector<StringRef> &Features) {
@@ -588,14 +695,13 @@
   Features.push_back(PtxFeature);
 }

-/// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary,
-/// which isn't properly a linker but nonetheless performs the step of stitching
-/// together object files from the assembler into a single blob.
-
-CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
-                             const ToolChain &HostTC, const ArgList &Args)
-    : ToolChain(D, Triple, Args), HostTC(HostTC),
-      CudaInstallation(D, HostTC.getTriple(), Args) {
+/// NVPTX toolchain. Our assembler is ptxas, and our linker is nvlink. This
+/// operates as a stand-alone version of the NVPTX tools without the host
+/// toolchain.
+NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
+                               const llvm::Triple &HostTriple,
+                               const ArgList &Args)
+    : ToolChain(D, Triple, Args), CudaInstallation(D, HostTriple, Args) {
   if (CudaInstallation.isValid()) {
     CudaInstallation.WarnIfUnsupportedVersion();
     getProgramPaths().push_back(std::string(CudaInstallation.getBinPath()));
@@ -605,19 +711,70 @@
   getProgramPaths().push_back(getDriver().Dir);
 }

-std::string CudaToolChain::getInputFilename(const InputInfo &Input) const {
-  // Only object files are changed, for example assembly files keep their .s
-  // extensions. If the user requested device-only compilation don't change it.
-  if (Input.getType() != types::TY_Object || getDriver().offloadDeviceOnly())
-    return ToolChain::getInputFilename(Input);
+/// We only need the host triple to locate the CUDA binary utilities, use the
+/// system's default triple if not provided.
+NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
+                               const ArgList &Args)
+    : NVPTXToolChain(D, Triple,
+                     llvm::Triple(llvm::sys::getDefaultTargetTriple()), Args) {}

-  // Replace extension for object files with cubin because nvlink relies on
-  // these particular file names.
-  SmallString<256> Filename(ToolChain::getInputFilename(Input));
-  llvm::sys::path::replace_extension(Filename, "cubin");
-  return std::string(Filename.str());
+llvm::opt::DerivedArgList *
+NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
+                              StringRef BoundArch,
+                              Action::OffloadKind DeviceOffloadKind) const {
+  DerivedArgList *DAL =
+      ToolChain::TranslateArgs(Args, BoundArch, DeviceOffloadKind);
+  if (!DAL)
+    DAL = new DerivedArgList(Args.getBaseArgs());
+
+  const OptTable &Opts = getDriver().getOpts();
+
+  for (Arg *A : Args)
+    if (!llvm::is_contained(*DAL, A))
+      DAL->append(A);
+
+  if (!DAL->hasArg(options::OPT_march_EQ))
+    DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
+                      CudaArchToString(CudaArch::CudaDefault));
+
+  return DAL;
 }

+bool NVPTXToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {
+  const Option &O = A->getOption();
+  return (O.matches(options::OPT_gN_Group) &&
+          !O.matches(options::OPT_gmodules)) ||
+         O.matches(options::OPT_g_Flag) ||
+         O.matches(options::OPT_ggdbN_Group) || O.matches(options::OPT_ggdb) ||
+         O.matches(options::OPT_gdwarf) || O.matches(options::OPT_gdwarf_2) ||
+         O.matches(options::OPT_gdwarf_3) || O.matches(options::OPT_gdwarf_4) ||
+         O.matches(options::OPT_gdwarf_5) ||
+         O.matches(options::OPT_gcolumn_info);
+}
+
+void NVPTXToolChain::adjustDebugInfoKind(
+    codegenoptions::DebugInfoKind &DebugInfoKind, const ArgList &Args) const {
+  switch (mustEmitDebugInfo(Args)) {
+  case DisableDebugInfo:
+    DebugInfoKind = codegenoptions::NoDebugInfo;
+    break;
+  case DebugDirectivesOnly:
+    DebugInfoKind = codegenoptions::DebugDirectivesOnly;
+    break;
+  case EmitSameDebugInfoAsHost:
+    // Use same debug info level as the host.
+    break;
+  }
+}
+
+/// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary,
+/// which isn't properly a linker but nonetheless performs the step of stitching
+/// together object files from the assembler into a single blob.
+
+CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
+                             const ToolChain &HostTC, const ArgList &Args)
+    : NVPTXToolChain(D, Triple, HostTC.getTriple(), Args), HostTC(HostTC) {}
+
 void CudaToolChain::addClangTargetOptions(
     const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
     Action::OffloadKind DeviceOffloadingKind) const {
@@ -696,33 +853,6 @@
   return llvm::DenormalMode::getIEEE();
 }

-bool CudaToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {
-  const Option &O = A->getOption();
-  return (O.matches(options::OPT_gN_Group) &&
-          !O.matches(options::OPT_gmodules)) ||
-         O.matches(options::OPT_g_Flag) ||
-         O.matches(options::OPT_ggdbN_Group) || O.matches(options::OPT_ggdb) ||
-         O.matches(options::OPT_gdwarf) || O.matches(options::OPT_gdwarf_2) ||
-         O.matches(options::OPT_gdwarf_3) || O.matches(options::OPT_gdwarf_4) ||
-         O.matches(options::OPT_gdwarf_5) ||
-         O.matches(options::OPT_gcolumn_info);
-}
-
-void CudaToolChain::adjustDebugInfoKind(
-    codegenoptions::DebugInfoKind &DebugInfoKind, const ArgList &Args) const {
-  switch (mustEmitDebugInfo(Args)) {
-  case DisableDebugInfo:
-    DebugInfoKind = codegenoptions::NoDebugInfo;
-    break;
-  case DebugDirectivesOnly:
-    DebugInfoKind = codegenoptions::DebugDirectivesOnly;
-    break;
-  case EmitSameDebugInfoAsHost:
-    // Use same debug info level as the host.
-    break;
-  }
-}
-
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
                                        ArgStringList &CC1Args) const {
   // Check our CUDA version if we're going to include the CUDA headers.
@@ -735,6 +865,19 @@
   CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
 }

+std::string CudaToolChain::getInputFilename(const InputInfo &Input) const {
+  // Only object files are changed, for example assembly files keep their .s
+  // extensions. If the user requested device-only compilation don't change it.
+  if (Input.getType() != types::TY_Object || getDriver().offloadDeviceOnly())
+    return ToolChain::getInputFilename(Input);
+
+  // Replace extension for object files with cubin because nvlink relies on
+  // these particular file names.
+  SmallString<256> Filename(ToolChain::getInputFilename(Input));
+  llvm::sys::path::replace_extension(Filename, "cubin");
+  return std::string(Filename.str());
+}
+
 llvm::opt::DerivedArgList *
 CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
                              StringRef BoundArch,
@@ -811,12 +954,20 @@
   return std::move(GPUArchs);
 }

+Tool *NVPTXToolChain::buildAssembler() const {
+  return new tools::NVPTX::Assembler(*this);
+}
+
+Tool *NVPTXToolChain::buildLinker() const {
+  return new tools::NVPTX::Linker(*this);
+}
+
 Tool *CudaToolChain::buildAssembler() const {
   return new tools::NVPTX::Assembler(*this);
 }

 Tool *CudaToolChain::buildLinker() const {
-  return new tools::NVPTX::Linker(*this);
+  return new tools::NVPTX::FatBinary(*this);
 }

 void CudaToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {
diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c
new file mode 100644
--- /dev/null
+++ b/clang/test/Driver/cuda-cross-compiling.c
@@ -0,0 +1,68 @@
+// Tests the driver when targeting the NVPTX architecture directly without a
+// host toolchain to perform CUDA mappings.
+
+// REQUIRES: nvptx-registered-target
+
+//
+// Test the generated phases when targeting NVPTX.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -ccc-print-phases %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=PHASES %s
+
+// PHASES: 0: input, "[[INPUT:.+]]", c
+// PHASES-NEXT: 1: preprocessor, {0}, cpp-output
+// PHASES-NEXT: 2: compiler, {1}, ir
+// PHASES-NEXT: 3: backend, {2}, assembler
+// PHASES-NEXT: 4: assembler, {3}, object
+// PHASES-NEXT: 5: linker, {4}, image
+
+//
+// Test the generated bindings when targeting NVPTX.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -ccc-print-bindings %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=BINDINGS %s
+
+// BINDINGS: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[PTX:.+]].s"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX]].s"], output: "[[CUBIN:.+]].o"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Linker", inputs: ["[[CUBIN]].o"], output: "a.out"
+
+//
+// Test the generated arguments to the CUDA binary utils when targeting NVPTX.
+// Ensure that the '.o' files are converted to '.cubin' if produced internally.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_61 -### %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=ARGS %s
+
+// ARGS: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_61" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s"
+// ARGS-NEXT: ptxas{{.*}}"-m64" "-O0" "--gpu-name" "sm_61" "--output-file" "[[CUBIN:.+]].cubin" "[[PTX]].s" "-c"
+// ARGS-NEXT: nvlink{{.*}}"-o" "a.out" "-arch" "sm_61" {{.*}} "[[CUBIN]].cubin"
+
+//
+// Test the generated arguments to the CUDA binary utils when targeting NVPTX.
+// Ensure that we emit '.o' files if compiled with '-c'
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_61 -c -### %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=OBJECT %s
+
+// OBJECT: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_61" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s"
+// OBJECT-NEXT: ptxas{{.*}}"-m64" "-O0" "--gpu-name" "sm_61" "--output-file" "[[OBJ:.+]].o" "[[PTX]].s" "-c"
+
+//
+// Test the generated arguments to the CUDA binary utils when targeting NVPTX.
+// Ensure that we copy input '.o' files to '.cubin' files when linking.
+//
+// RUN: touch %t.o
+// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_61 -### %t.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=LINK %s
+
+// LINK: nvlink{{.*}}"-o" "a.out" "-arch" "sm_61" {{.*}} "{{.*}}.cubin"
+
+//
+// Test the generated arguments default to a value with no architecture.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -### %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=DEFAULT %s
+
+// DEFAULT: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_35" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s"
+// DEFAULT-NEXT: ptxas{{.*}}"-m64" "-O0" "--gpu-name" "sm_35" "--output-file" "[[CUBIN:.+]].cubin" "[[PTX]].s" "-c"
+// DEFAULT-NEXT: nvlink{{.*}}"-o" "a.out" "-arch" "sm_35" {{.*}} "[[CUBIN]].cubin"