Index: cfe/trunk/include/clang/Driver/Action.h
===================================================================
--- cfe/trunk/include/clang/Driver/Action.h
+++ cfe/trunk/include/clang/Driver/Action.h
@@ -136,7 +136,8 @@
 
 class CudaDeviceAction : public Action {
   virtual void anchor();
-  /// GPU architecture to bind. Always of the form /sm_\d+/.
+  /// GPU architecture to bind. Always of the form /sm_\d+/ or null (when the
+  /// action applies to multiple architectures).
   const char *GpuArchName;
   /// True when action results are not consumed by the host action (e.g when
   /// -fsyntax-only or --cuda-device-only options are used).
@@ -147,7 +148,8 @@
 
   const char *getGpuArchName() const { return GpuArchName; }
 
-  /// Gets the compute_XX that corresponds to getGpuArchName().
+  /// Gets the compute_XX that corresponds to getGpuArchName(). Returns null
+  /// when getGpuArchName() is null.
   const char *getComputeArchName() const;
 
   bool isAtTopLevel() const { return AtTopLevel; }
Index: cfe/trunk/include/clang/Driver/Options.td
===================================================================
--- cfe/trunk/include/clang/Driver/Options.td
+++ cfe/trunk/include/clang/Driver/Options.td
@@ -336,6 +336,10 @@
 def Xclang : Separate<["-"], "Xclang">,
   HelpText<"Pass <arg> to the clang compiler">, MetaVarName<"<arg>">,
   Flags<[DriverOption, CoreOption]>;
+def Xcuda_fatbinary : Separate<["-"], "Xcuda-fatbinary">,
+  HelpText<"Pass <arg> to fatbinary invocation">, MetaVarName<"<arg>">;
+def Xcuda_ptxas : Separate<["-"], "Xcuda-ptxas">,
+  HelpText<"Pass <arg> to the ptxas assembler">, MetaVarName<"<arg>">;
 def z : Separate<["-"], "z">, Flags<[LinkerInput, RenderAsInput]>,
   HelpText<"Pass -z <arg> to the linker">, MetaVarName<"<arg>">;
 def Xlinker : Separate<["-"], "Xlinker">, Flags<[LinkerInput, RenderAsInput]>,
Index: cfe/trunk/include/clang/Driver/ToolChain.h
===================================================================
--- cfe/trunk/include/clang/Driver/ToolChain.h
+++ cfe/trunk/include/clang/Driver/ToolChain.h
@@ -228,7 +228,7 @@
   virtual bool IsIntegratedAssemblerDefault() const { return false; }
 
   /// \brief Check if the toolchain should use the integrated assembler.
-  bool useIntegratedAs() const;
+  virtual bool useIntegratedAs() const;
 
   /// IsMathErrnoDefault - Does this tool chain use -fmath-errno by default.
   virtual bool IsMathErrnoDefault() const { return true; }
Index: cfe/trunk/include/clang/Driver/Types.def
===================================================================
--- cfe/trunk/include/clang/Driver/Types.def
+++ cfe/trunk/include/clang/Driver/Types.def
@@ -93,4 +93,5 @@
 TYPE("image", Image, INVALID, "out", "")
 TYPE("dSYM", dSYM, INVALID, "dSYM", "A")
 TYPE("dependencies", Dependencies, INVALID, "d", "")
+TYPE("cuda-fatbin", CUDA_FATBIN, INVALID, "fatbin", "A")
 TYPE("none", Nothing, INVALID, nullptr, "u")
Index: cfe/trunk/lib/CodeGen/CGCUDANV.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CGCUDANV.cpp
+++ cfe/trunk/lib/CodeGen/CGCUDANV.cpp
@@ -259,6 +259,8 @@
       TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage,
       llvm::ConstantStruct::get(FatbinWrapperTy, Values),
       "__cuda_fatbin_wrapper");
+  // NVIDIA's cuobjdump looks for fatbins in this section.
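+  // (Registration with the CUDA runtime still happens through the explicit
+  // __cudaRegisterFatBinary call emitted below; the named section exists so
+  // that offline tools can locate the embedded fatbin.)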
+  FatbinWrapper->setSection(".nvFatBinSegment");
 
   // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
   llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
Index: cfe/trunk/lib/Driver/Action.cpp
===================================================================
--- cfe/trunk/lib/Driver/Action.cpp
+++ cfe/trunk/lib/Driver/Action.cpp
@@ -75,7 +75,7 @@
                                    bool AtTopLevel)
     : Action(CudaDeviceClass, Input), GpuArchName(ArchName),
       AtTopLevel(AtTopLevel) {
-  assert(IsValidGpuArchName(GpuArchName));
+  assert(!GpuArchName || IsValidGpuArchName(GpuArchName));
 }
 
 const char *CudaDeviceAction::getComputeArchName() const {
Index: cfe/trunk/lib/Driver/Driver.cpp
===================================================================
--- cfe/trunk/lib/Driver/Driver.cpp
+++ cfe/trunk/lib/Driver/Driver.cpp
@@ -949,8 +949,9 @@
     os << '"' << BIA->getArchName() << '"' << ", {"
        << PrintActions1(C, *BIA->begin(), Ids) << "}";
   } else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
-    os << '"' << CDA->getGpuArchName() << '"' << ", {"
-       << PrintActions1(C, *CDA->begin(), Ids) << "}";
+    os << '"'
+       << (CDA->getGpuArchName() ? CDA->getGpuArchName() : "(multiple archs)")
+       << '"' << ", {" << PrintActions1(C, *CDA->begin(), Ids) << "}";
   } else {
     const ActionList *AL;
     if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
@@ -1327,7 +1328,7 @@
   // Check whether any of device actions stopped before they could generate PTX.
   bool PartialCompilation =
       llvm::any_of(CudaDeviceActions, [](const Action *a) {
-        return a->getKind() != Action::BackendJobClass;
+        return a->getKind() != Action::AssembleJobClass;
       });
 
   // Figure out what to do with device actions -- pass them as inputs to the
@@ -1356,16 +1357,32 @@
     return HostAction;
   }
 
-  // Outputs of device actions during complete CUDA compilation get created
-  // with AtTopLevel=false and become inputs for the host action.
+  // If we're not a partial or device-only compilation, we compile each arch to
+  // ptx and assemble to cubin, then feed the cubin *and* the ptx into a device
+  // "link" action, which uses fatbinary to combine these cubins into one
+  // fatbin. The fatbin is then an input to the host compilation.
   ActionList DeviceActions;
-  for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
-    DeviceActions.push_back(
-        C.MakeAction<CudaDeviceAction>(CudaDeviceActions[I], GpuArchList[I],
-                                       /* AtTopLevel */ false));
+  for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+    Action* AssembleAction = CudaDeviceActions[I];
+    assert(AssembleAction->getType() == types::TY_Object);
+    assert(AssembleAction->getInputs().size() == 1);
+
+    Action* BackendAction = AssembleAction->getInputs()[0];
+    assert(BackendAction->getType() == types::TY_PP_Asm);
+
+    for (const auto& A : {AssembleAction, BackendAction}) {
+      DeviceActions.push_back(C.MakeAction<CudaDeviceAction>(
+          A, GpuArchList[I], /* AtTopLevel */ false));
+    }
+  }
+  auto FatbinAction = C.MakeAction<CudaDeviceAction>(
+      C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN),
+      /* GpuArchName = */ nullptr,
+      /* AtTopLevel = */ false);
   // Return a new host action that incorporates original host action and all
   // device actions.
-  return C.MakeAction<CudaHostAction>(HostAction, DeviceActions);
+  return C.MakeAction<CudaHostAction>(std::move(HostAction),
+                                      ActionList({FatbinAction}));
 }
 
 void Driver::BuildActions(Compilation &C, const ToolChain &TC,
@@ -1600,7 +1617,7 @@
     return C.MakeAction<BackendJobAction>(Input, types::TY_PP_Asm);
   }
   case phases::Assemble:
-    return C.MakeAction<AssembleJobAction>(Input, types::TY_Object);
+    return C.MakeAction<AssembleJobAction>(std::move(Input), types::TY_Object);
   }
 
   llvm_unreachable("invalid phase in ConstructPhaseAction");
@@ -1849,11 +1866,14 @@
   if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
     // Initial processing of CudaDeviceAction carries host params.
     // Call BuildJobsForAction() again, now with correct device parameters.
-    assert(CDA->getGpuArchName() && "No GPU name in device action.");
-    return BuildJobsForAction(C, *CDA->begin(), C.getCudaDeviceToolChain(),
-                              CDA->getGpuArchName(), CDA->isAtTopLevel(),
-                              /*MultipleArchs*/ true, LinkingOutput,
-                              CachedResults);
+    InputInfo II = BuildJobsForAction(
+        C, *CDA->begin(), C.getCudaDeviceToolChain(), CDA->getGpuArchName(),
+        CDA->isAtTopLevel(), /*MultipleArchs*/ true, LinkingOutput,
+        CachedResults);
+    // Currently II's Action is *CDA->begin(). Set it to CDA instead, so that
+    // one can retrieve II's GPU arch.
+    II.setAction(A);
+    return II;
   }
 
   const ActionList *Inputs = &A->getInputs();
Index: cfe/trunk/lib/Driver/ToolChains.h
===================================================================
--- cfe/trunk/lib/Driver/ToolChains.h
+++ cfe/trunk/lib/Driver/ToolChains.h
@@ -163,6 +163,7 @@
     bool IsValid;
     const Driver &D;
     std::string CudaInstallPath;
+    std::string CudaBinPath;
    std::string CudaLibPath;
     std::string CudaLibDevicePath;
     std::string CudaIncludePath;
@@ -179,6 +180,8 @@
 
     /// \brief Get the detected Cuda installation path.
     StringRef getInstallPath() const { return CudaInstallPath; }
+    /// \brief Get the detected path to Cuda's bin directory.
+    StringRef getBinPath() const { return CudaBinPath; }
     /// \brief Get the detected Cuda Include path.
     StringRef getIncludePath() const { return CudaIncludePath; }
     /// \brief Get the detected Cuda library path.
@@ -816,6 +819,14 @@
                 const char *BoundArch) const override;
   void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
                              llvm::opt::ArgStringList &CC1Args) const override;
+
+  // Never try to use the integrated assembler with CUDA; always fork out to
+  // ptxas.
+  bool useIntegratedAs() const override { return false; }
+
+protected:
+  Tool *buildAssembler() const override;  // ptxas
+  Tool *buildLinker() const override;     // fatbinary (ok, not really a linker)
 };
 
 class LLVM_LIBRARY_VISIBILITY MipsLLVMToolChain : public Linux {
Index: cfe/trunk/lib/Driver/ToolChains.cpp
===================================================================
--- cfe/trunk/lib/Driver/ToolChains.cpp
+++ cfe/trunk/lib/Driver/ToolChains.cpp
@@ -1652,13 +1652,14 @@
       continue;
 
     CudaInstallPath = CudaPath;
+    CudaBinPath = CudaPath + "/bin";
     CudaIncludePath = CudaInstallPath + "/include";
     CudaLibDevicePath = CudaInstallPath + "/nvvm/libdevice";
     CudaLibPath =
         CudaInstallPath + (TargetTriple.isArch64Bit() ? "/lib64" : "/lib");
 
     if (!(D.getVFS().exists(CudaIncludePath) &&
-          D.getVFS().exists(CudaLibPath) &&
+          D.getVFS().exists(CudaBinPath) && D.getVFS().exists(CudaLibPath) &&
           D.getVFS().exists(CudaLibDevicePath)))
       continue;
 
@@ -4182,13 +4183,16 @@
   return new tools::dragonfly::Linker(*this);
 }
 
-/// Stub for CUDA toolchain. At the moment we don't have assembler or
-/// linker and need toolchain mainly to propagate device-side options
-/// to CC1.
+/// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary,
+/// which isn't really a linker but nonetheless stitches together the object
+/// files produced by the assembler into a single blob.
 CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
                              const ArgList &Args)
-    : Linux(D, Triple, Args) {}
+    : Linux(D, Triple, Args) {
+  if (CudaInstallation.isValid())
+    getProgramPaths().push_back(CudaInstallation.getBinPath());
+}
 
 void
 CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
                                      llvm::opt::ArgStringList &CC1Args) const
@@ -4222,7 +4226,7 @@
   for (Arg *A : Args) {
     if (A->getOption().matches(options::OPT_Xarch__)) {
       // Skip this argument unless the architecture matches BoundArch
-      if (A->getValue(0) != StringRef(BoundArch))
+      if (!BoundArch || A->getValue(0) != StringRef(BoundArch))
         continue;
 
       unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
@@ -4253,10 +4257,19 @@
     DAL->append(A);
   }
 
-  DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
+  if (BoundArch)
+    DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
   return DAL;
 }
 
+Tool *CudaToolChain::buildAssembler() const {
+  return new tools::NVPTX::Assembler(*this);
+}
+
+Tool *CudaToolChain::buildLinker() const {
+  return new tools::NVPTX::Linker(*this);
+}
+
 /// XCore tool chain
 XCoreToolChain::XCoreToolChain(const Driver &D, const llvm::Triple &Triple,
                                const ArgList &Args)
Index: cfe/trunk/lib/Driver/Tools.h
===================================================================
--- cfe/trunk/lib/Driver/Tools.h
+++ cfe/trunk/lib/Driver/Tools.h
@@ -903,6 +903,41 @@
 };
 } // end namespace PS4cpu
 
+namespace NVPTX {
+
+// Run ptxas, the NVPTX assembler.
+class LLVM_LIBRARY_VISIBILITY Assembler : public Tool {
+ public:
+  Assembler(const ToolChain &TC)
+      : Tool("NVPTX::Assembler", "ptxas", TC, RF_Full, llvm::sys::WEM_UTF8,
+             "--options-file") {}
+
+  bool hasIntegratedCPP() const override { return false; }
+
+  void ConstructJob(Compilation &C, const JobAction &JA,
+                    const InputInfo &Output, const InputInfoList &Inputs,
+                    const llvm::opt::ArgList &TCArgs,
+                    const char *LinkingOutput) const override;
+};
+
+// Runs fatbinary, which combines GPU object files ("cubin" files) and/or PTX
+// assembly into a single output file.
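+// The resulting fatbin is then handed to the host-side compilation via
+// -fcuda-include-gpubinary (see cuda-options.cu / cuda-external-tools.cu).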
+class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
+ public:
+  Linker(const ToolChain &TC)
+      : Tool("NVPTX::Linker", "fatbinary", TC, RF_Full, llvm::sys::WEM_UTF8,
+             "--options-file") {}
+
+  bool hasIntegratedCPP() const override { return false; }
+
+  void ConstructJob(Compilation &C, const JobAction &JA,
+                    const InputInfo &Output, const InputInfoList &Inputs,
+                    const llvm::opt::ArgList &TCArgs,
+                    const char *LinkingOutput) const override;
+};
+
+} // end namespace NVPTX
+
 } // end namespace tools
 } // end namespace driver
 } // end namespace clang
Index: cfe/trunk/lib/Driver/Tools.cpp
===================================================================
--- cfe/trunk/lib/Driver/Tools.cpp
+++ cfe/trunk/lib/Driver/Tools.cpp
@@ -10625,3 +10625,81 @@
   else
     ConstructGoldLinkJob(*this, C, JA, Output, Inputs, Args, LinkingOutput);
 }
+
+void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
+                                    const InputInfo &Output,
+                                    const InputInfoList &Inputs,
+                                    const ArgList &Args,
+                                    const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+  assert(TC.getArch() == llvm::Triple::nvptx ||
+         TC.getArch() == llvm::Triple::nvptx64);
+
+  std::vector<std::string> gpu_archs =
+      Args.getAllArgValues(options::OPT_march_EQ);
+  assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas.");
+  const std::string& gpu_arch = gpu_archs[0];
+
+  ArgStringList CmdArgs;
+  CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32");
+
+  // Clang's default optimization level is -O0, but ptxas's default is -O3.
+  CmdArgs.push_back(Args.MakeArgString(
+      llvm::Twine("-O") +
+      Args.getLastArgValue(options::OPT_O_Group, "0").data()));
+
+  // Don't bother passing -g to ptxas: It's enabled by default at -O0, and
+  // not supported at other optimization levels.
+
+  CmdArgs.push_back("--gpu-name");
+  CmdArgs.push_back(Args.MakeArgString(gpu_arch));
+  CmdArgs.push_back("--output-file");
+  CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+  for (const auto& II : Inputs)
+    CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
+
+  for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
+    CmdArgs.push_back(Args.MakeArgString(A));
+
+  const char *Exec = Args.MakeArgString(TC.GetProgramPath("ptxas"));
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}
+
+// All inputs to this linker must be from CudaDeviceActions, as we need to look
+// at the Inputs' Actions in order to figure out which GPU architecture they
+// correspond to.
+void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
+                                 const InputInfo &Output,
+                                 const InputInfoList &Inputs,
+                                 const ArgList &Args,
+                                 const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+  assert(TC.getArch() == llvm::Triple::nvptx ||
+         TC.getArch() == llvm::Triple::nvptx64);
+
+  ArgStringList CmdArgs;
+  CmdArgs.push_back("--cuda");
+  CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32");
+  CmdArgs.push_back(Args.MakeArgString("--create"));
+  CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+
+  for (const auto& II : Inputs) {
+    auto* A = cast<const CudaDeviceAction>(II.getAction());
+    // We need to pass an Arch of the form "sm_XX" for cubin files and
+    // "compute_XX" for ptx.
+    const char *Arch = (II.getType() == types::TY_PP_Asm)
+                           ? A->getComputeArchName()
+                           : A->getGpuArchName();
+    CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
+                                         Arch + ",file=" + II.getFilename()));
+  }
+
+  for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary))
+    CmdArgs.push_back(Args.MakeArgString(A));
+
+  const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary"));
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}
Index: cfe/trunk/lib/Driver/Types.cpp
===================================================================
--- cfe/trunk/lib/Driver/Types.cpp
+++ cfe/trunk/lib/Driver/Types.cpp
@@ -232,8 +232,7 @@
       P.push_back(phases::Compile);
       P.push_back(phases::Backend);
     }
-    if (Id != TY_CUDA_DEVICE)
-      P.push_back(phases::Assemble);
+    P.push_back(phases::Assemble);
   }
 }
 
Index: cfe/trunk/test/Driver/cuda-arch-translation.cu
===================================================================
--- cfe/trunk/test/Driver/cuda-arch-translation.cu
+++ cfe/trunk/test/Driver/cuda-arch-translation.cu
@@ -0,0 +1,37 @@
+// Tests that "sm_XX" gets correctly converted to "compute_YY" when we invoke
+// fatbinary.
+//
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// CHECK:fatbinary
+
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_21 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM21 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_30 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM30 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_32 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM32 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_35 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM35 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_37 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM37 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_50 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM50 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_52 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM52 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_53 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM53 %s
+
+// SM20:--image=profile=sm_20{{.*}}--image=profile=compute_20
+// SM21:--image=profile=sm_21{{.*}}--image=profile=compute_20
+// SM30:--image=profile=sm_30{{.*}}--image=profile=compute_30
+// SM32:--image=profile=sm_32{{.*}}--image=profile=compute_32
+// SM35:--image=profile=sm_35{{.*}}--image=profile=compute_35
+// SM37:--image=profile=sm_37{{.*}}--image=profile=compute_37
+// SM50:--image=profile=sm_50{{.*}}--image=profile=compute_50
+// SM52:--image=profile=sm_52{{.*}}--image=profile=compute_52
+// SM53:--image=profile=sm_53{{.*}}--image=profile=compute_53
Index: cfe/trunk/test/Driver/cuda-external-tools.cu
===================================================================
--- cfe/trunk/test/Driver/cuda-external-tools.cu
+++ cfe/trunk/test/Driver/cuda-external-tools.cu
@@ -0,0 +1,70 @@
+// Tests that ptxas and fatbinary are invoked correctly during CUDA compilation.
+//
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// Regular compile with -O2.
+// RUN: %clang -### -target x86_64-linux-gnu -O2 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s
+
+// Regular compile without -O. This should result in us passing -O0 to ptxas.
+// RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s
+
+// Regular compile targeting sm_35.
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM35 %s
+
+// 32-bit compile.
+// RUN: %clang -### -target x86_32-linux-gnu -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH32 -check-prefix SM20 %s
+
+// Compile with -fintegrated-as. This should still cause us to invoke ptxas.
+// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s
+
+// Check -Xcuda-ptxas and -Xcuda-fatbinary
+// RUN: %clang -### -target x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \
+// RUN:   -Xcuda-fatbinary -bar1 -Xcuda-ptxas -foo2 -Xcuda-fatbinary -bar2 %s 2>&1 \
+// RUN: | FileCheck -check-prefix SM20 -check-prefix PTXAS-EXTRA \
+// RUN:   -check-prefix FATBINARY-EXTRA %s
+
+// Match clang job that produces PTX assembly.
+// CHECK: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// SM20: "-target-cpu" "sm_20"
+// SM35: "-target-cpu" "sm_35"
+// SM20: "-o" "[[PTXFILE:[^"]*]]"
+// SM35: "-o" "[[PTXFILE:[^"]*]]"
+
+// Match the call to ptxas (which assembles PTX to SASS).
+// CHECK:ptxas
+// ARCH64: "-m64"
+// ARCH32: "-m32"
+// OPT0: "-O0"
+// OPT2: "-O2"
+// SM20: "--gpu-name" "sm_20"
+// SM35: "--gpu-name" "sm_35"
+// SM20: "--output-file" "[[CUBINFILE:[^"]*]]"
+// SM35: "--output-file" "[[CUBINFILE:[^"]*]]"
+// PTXAS-EXTRA: "-foo1"
+// PTXAS-EXTRA-SAME: "-foo2"
+// CHECK-SAME: "[[PTXFILE]]"
+
+// Match the call to fatbinary (which combines all our PTX and SASS into one
+// blob).
+// CHECK:fatbinary
+// CHECK-DAG: "--cuda"
+// ARCH64-DAG: "-64"
+// ARCH32-DAG: "-32"
+// CHECK-DAG: "--create" "[[FATBINARY:[^"]*]]"
+// SM20-DAG: "--image=profile=compute_20,file=[[PTXFILE]]"
+// SM35-DAG: "--image=profile=compute_35,file=[[PTXFILE]]"
+// SM20-DAG: "--image=profile=sm_20,file=[[CUBINFILE]]"
+// SM35-DAG: "--image=profile=sm_35,file=[[CUBINFILE]]"
+// FATBINARY-EXTRA: "-bar1"
+// FATBINARY-EXTRA-SAME: "-bar2"
+
+// Match the clang job for host compilation.
+// CHECK: "-cc1" "-triple" "x86_64--linux-gnu"
+// CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
Index: cfe/trunk/test/Driver/cuda-options.cu
===================================================================
--- cfe/trunk/test/Driver/cuda-options.cu
+++ cfe/trunk/test/Driver/cuda-options.cu
@@ -39,13 +39,6 @@
 // RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
 // RUN:   -check-prefix NOHOST -check-prefix NOLINK %s
 
-// Verify that with -S we compile host and device sides to assembly and
-// incorporate device code into the host side.
-// RUN: %clang -### -target x86_64-linux-gnu -S -c %s 2>&1 \
-// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
-// RUN:   -check-prefix HOST -check-prefix INCLUDES-DEVICE \
-// RUN:   -check-prefix NOLINK %s
-
 // Verify that --cuda-gpu-arch option passes the correct GPU architecture to
 // device compilation.
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
@@ -61,7 +54,7 @@
 // RUN:   -check-prefix DEVICE2 -check-prefix DEVICE-SM35 \
 // RUN:   -check-prefix DEVICE2-SM30 -check-prefix HOST \
 // RUN:   -check-prefix HOST-NOSAVE -check-prefix INCLUDES-DEVICE \
-// RUN:   -check-prefix INCLUDES-DEVICE2 -check-prefix NOLINK %s
+// RUN:   -check-prefix NOLINK %s
 
 // Verify that device-side results are passed to the correct tool when
 // -save-temps is used.
@@ -92,10 +85,16 @@
 // DEVICE-NOSAVE-SAME: "-aux-triple" "x86_64--linux-gnu"
 // DEVICE-SAME: "-fcuda-is-device"
 // DEVICE-SM35-SAME: "-target-cpu" "sm_35"
-// DEVICE-SAME: "-o" "[[GPUBINARY1:[^"]*]]"
+// DEVICE-SAME: "-o" "[[PTXFILE:[^"]*]]"
 // DEVICE-NOSAVE-SAME: "-x" "cuda"
 // DEVICE-SAVE-SAME: "-x" "ir"
+
+// Match the call to ptxas (which assembles PTX to SASS).
+// DEVICE:ptxas
+// DEVICE-SM35-DAG: "--gpu-name" "sm_35"
+// DEVICE-DAG: "--output-file" "[[CUBINFILE:[^"]*]]"
+// DEVICE-DAG: "[[PTXFILE]]"
 
 // Match another device-side compilation.
 // DEVICE2: "-cc1" "-triple" "nvptx64-nvidia-cuda"
 // DEVICE2-SAME: "-aux-triple" "x86_64--linux-gnu"
@@ -108,6 +107,11 @@
 // NODEVICE-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda"
 // NODEVICE-SAME-NOT: "-fcuda-is-device"
+
+// INCLUDES-DEVICE:fatbinary
+// INCLUDES-DEVICE-DAG: "--create" "[[FATBINARY:[^"]*]]"
+// INCLUDES-DEVICE-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE]]"
+// INCLUDES-DEVICE-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE]]"
 
 // Match host-side preprocessor job with -save-temps.
 // HOST-SAVE: "-cc1" "-triple" "x86_64--linux-gnu"
 // HOST-SAVE-SAME: "-aux-triple" "nvptx64-nvidia-cuda"
@@ -121,8 +125,7 @@
 // HOST-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
 // HOST-NOSAVE-SAME: "-x" "cuda"
 // HOST-SAVE-SAME: "-x" "cuda-cpp-output"
-// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY1]]"
-// INCLUDES-DEVICE2-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY2]]"
+// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
 
 // Match external assembler that uses compilation output.
 // HOST-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]"
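
As a quick cross-reference for the tests above: cuda-arch-translation.cu pins down the sm_XX to compute_XX translation that CudaDeviceAction::getComputeArchName() is expected to perform (every sm_XX pairs with compute_XX, except sm_21, which reuses compute_20 since there is no compute_21 profile). The standalone sketch below only mirrors those CHECK lines; the helper name and table are illustrative and are not part of the patch.

// Illustrative sketch only -- mirrors the expectations encoded in
// cuda-arch-translation.cu, not the actual implementation in Action.cpp.
#include <cstdio>
#include <cstring>

static const char *ComputeArchForGpuArch(const char *GpuArchName) {
  if (!GpuArchName)
    return nullptr;                      // Multi-arch action: no single arch.
  if (std::strcmp(GpuArchName, "sm_21") == 0)
    return "compute_20";                 // sm_21 has no compute_21 profile.
  static const struct { const char *Sm, *Compute; } Table[] = {
      {"sm_20", "compute_20"}, {"sm_30", "compute_30"}, {"sm_32", "compute_32"},
      {"sm_35", "compute_35"}, {"sm_37", "compute_37"}, {"sm_50", "compute_50"},
      {"sm_52", "compute_52"}, {"sm_53", "compute_53"},
  };
  for (const auto &Entry : Table)
    if (std::strcmp(GpuArchName, Entry.Sm) == 0)
      return Entry.Compute;
  return nullptr;                        // Unknown GPU architecture.
}

int main() {
  // For a sm_35 compilation, fatbinary then receives both
  // --image=profile=sm_35,file=<cubin> and --image=profile=compute_35,file=<ptx>.
  std::printf("sm_35 -> %s\n", ComputeArchForGpuArch("sm_35"));
  std::printf("sm_21 -> %s\n", ComputeArchForGpuArch("sm_21"));
}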