Index: include/clang/Driver/Action.h =================================================================== --- include/clang/Driver/Action.h +++ include/clang/Driver/Action.h @@ -139,17 +139,15 @@ virtual void anchor(); /// GPU architecture to bind -- e.g 'sm_35'. const char *GpuArchName; - const char *DeviceTriple; /// True when action results are not consumed by the host action (e.g when /// -fsyntax-only or --cuda-device-only options are used). bool AtTopLevel; public: CudaDeviceAction(std::unique_ptr<Action> Input, const char *ArchName, - const char *DeviceTriple, bool AtTopLevel); + bool AtTopLevel); const char *getGpuArchName() const { return GpuArchName; } - const char *getDeviceTriple() const { return DeviceTriple; } bool isAtTopLevel() const { return AtTopLevel; } static bool classof(const Action *A) { @@ -160,16 +158,13 @@ class CudaHostAction : public Action { virtual void anchor(); ActionList DeviceActions; - const char *DeviceTriple; public: - CudaHostAction(std::unique_ptr<Action> Input, const ActionList &DeviceActions, - const char *DeviceTriple); + CudaHostAction(std::unique_ptr<Action> Input, + const ActionList &DeviceActions); ~CudaHostAction() override; - ActionList &getDeviceActions() { return DeviceActions; } const ActionList &getDeviceActions() const { return DeviceActions; } - const char *getDeviceTriple() const { return DeviceTriple; } static bool classof(const Action *A) { return A->getKind() == CudaHostClass; } }; Index: include/clang/Driver/Compilation.h =================================================================== --- include/clang/Driver/Compilation.h +++ include/clang/Driver/Compilation.h @@ -38,6 +38,9 @@ /// The default tool chain. const ToolChain &DefaultToolChain; + const ToolChain *CudaHostToolChain; + const ToolChain *CudaDeviceToolChain; + /// The original (untranslated) input argument list. llvm::opt::InputArgList *Args; @@ -81,6 +84,17 @@ const Driver &getDriver() const { return TheDriver; } const ToolChain &getDefaultToolChain() const { return DefaultToolChain; } + const ToolChain *getCudaHostToolChain() const { return CudaHostToolChain; } + const ToolChain *getCudaDeviceToolChain() const { + return CudaDeviceToolChain; + } + + void setCudaHostToolChain(const ToolChain *HostToolChain) { + CudaHostToolChain = HostToolChain; + } + void setCudaDeviceToolChain(const ToolChain *DeviceToolChain) { + CudaDeviceToolChain = DeviceToolChain; + } const llvm::opt::InputArgList &getInputArgs() const { return *Args; } Index: include/clang/Driver/Driver.h =================================================================== --- include/clang/Driver/Driver.h +++ include/clang/Driver/Driver.h @@ -297,22 +297,23 @@ /// BuildActions - Construct the list of actions to perform for the /// given arguments, which are only done for a single architecture. /// + /// \param C - The compilation that is being built. /// \param TC - The default host tool chain. /// \param Args - The input arguments. /// \param Actions - The list to store the resulting actions onto. - void BuildActions(const ToolChain &TC, llvm::opt::DerivedArgList &Args, - const InputList &Inputs, ActionList &Actions) const; + void BuildActions(Compilation &C, const ToolChain &TC, + llvm::opt::DerivedArgList &Args, const InputList &Inputs, + ActionList &Actions) const; /// BuildUniversalActions - Construct the list of actions to perform /// for the given arguments, which may require a universal build. /// + /// \param C - The compilation that is being built. /// \param TC - The default host tool chain.
/// \param Args - The input arguments. /// \param Actions - The list to store the resulting actions onto. - void BuildUniversalActions(const ToolChain &TC, - llvm::opt::DerivedArgList &Args, - const InputList &BAInputs, - ActionList &Actions) const; + void BuildUniversalActions(Compilation &C, const ToolChain &TC, + const InputList &BAInputs) const; /// BuildJobs - Bind actions to concrete tools and translate /// arguments to form the list of jobs to run. @@ -433,13 +434,6 @@ /// compilation based on which -f(no-)?lto(=.*)? option occurs last. void setLTOMode(const llvm::opt::ArgList &Args); - /// \brief Retrieves a ToolChain for a particular \p Target triple. - /// - /// Will cache ToolChains for the life of the driver object, and create them - /// on-demand. - const ToolChain &getToolChain(const llvm::opt::ArgList &Args, - const llvm::Triple &Target) const; - /// @} /// \brief Get bitmasks for which option flags to include and exclude based on @@ -447,6 +441,13 @@ std::pair<unsigned, unsigned> getIncludeExcludeOptionFlagMasks() const; public: + /// \brief Retrieves a ToolChain for a particular \p Target triple. + /// + /// Will cache ToolChains for the life of the driver object, and create them + /// on-demand. + const ToolChain &getToolChain(const llvm::opt::ArgList &Args, + const llvm::Triple &Target) const; + /// GetReleaseVersion - Parse (([0-9]+)(.([0-9]+)(.([0-9]+)?))?)? and /// return the grouped values as integers. Numbers which are not /// provided are set to 0. Index: include/clang/Driver/Options.td =================================================================== --- include/clang/Driver/Options.td +++ include/clang/Driver/Options.td @@ -1592,6 +1592,8 @@ def no__dead__strip__inits__and__terms : Flag<["-"], "no_dead_strip_inits_and_terms">; def nobuiltininc : Flag<["-"], "nobuiltininc">, Flags<[CC1Option]>, HelpText<"Disable builtin #include directories">; +def nocudainc : Flag<["-"], "nocudainc">; +def nocudalib : Flag<["-"], "nocudalib">; def nodefaultlibs : Flag<["-"], "nodefaultlibs">; def nofixprebinding : Flag<["-"], "nofixprebinding">; def nolibc : Flag<["-"], "nolibc">; Index: include/clang/Driver/ToolChain.h =================================================================== --- include/clang/Driver/ToolChain.h +++ include/clang/Driver/ToolChain.h @@ -393,6 +393,10 @@ virtual void addProfileRTLibs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const; + /// \brief Add arguments to use system-specific CUDA includes. + virtual void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const; + /// \brief Return sanitizers which are available in this toolchain.
virtual SanitizerMask getSupportedSanitizers() const; }; Index: lib/Driver/Action.cpp =================================================================== --- lib/Driver/Action.cpp +++ lib/Driver/Action.cpp @@ -58,18 +58,15 @@ void CudaDeviceAction::anchor() {} CudaDeviceAction::CudaDeviceAction(std::unique_ptr<Action> Input, - const char *ArchName, - const char *DeviceTriple, bool AtTopLevel) + const char *ArchName, bool AtTopLevel) : Action(CudaDeviceClass, std::move(Input)), GpuArchName(ArchName), - DeviceTriple(DeviceTriple), AtTopLevel(AtTopLevel) {} + AtTopLevel(AtTopLevel) {} void CudaHostAction::anchor() {} CudaHostAction::CudaHostAction(std::unique_ptr<Action> Input, - const ActionList &DeviceActions, - const char *DeviceTriple) - : Action(CudaHostClass, std::move(Input)), DeviceActions(DeviceActions), - DeviceTriple(DeviceTriple) {} + const ActionList &DeviceActions) + : Action(CudaHostClass, std::move(Input)), DeviceActions(DeviceActions) {} CudaHostAction::~CudaHostAction() { for (auto &DA : DeviceActions) Index: lib/Driver/Compilation.cpp =================================================================== --- lib/Driver/Compilation.cpp +++ lib/Driver/Compilation.cpp @@ -24,8 +24,9 @@ Compilation::Compilation(const Driver &D, const ToolChain &_DefaultToolChain, InputArgList *_Args, DerivedArgList *_TranslatedArgs) - : TheDriver(D), DefaultToolChain(_DefaultToolChain), Args(_Args), - TranslatedArgs(_TranslatedArgs), Redirects(nullptr), + : TheDriver(D), DefaultToolChain(_DefaultToolChain), + CudaHostToolChain(&DefaultToolChain), CudaDeviceToolChain(nullptr), + Args(_Args), TranslatedArgs(_TranslatedArgs), Redirects(nullptr), ForDiagnostics(false) {} Compilation::~Compilation() { Index: lib/Driver/Driver.cpp =================================================================== --- lib/Driver/Driver.cpp +++ lib/Driver/Driver.cpp @@ -491,6 +491,10 @@ // The compilation takes ownership of Args. Compilation *C = new Compilation(*this, TC, UArgs.release(), TranslatedArgs); + C->setCudaDeviceToolChain( + &getToolChain(C->getArgs(), llvm::Triple(TC.getTriple().isArch64Bit() ? "nvptx64-nvidia-cuda" : "nvptx-nvidia-cuda"))); if (!HandleImmediateArgs(*C)) return C; @@ -501,10 +505,9 @@ // Construct the list of abstract actions to perform for this compilation. On // MachO targets this uses the driver-driver and universal actions. if (TC.getTriple().isOSBinFormatMachO()) - BuildUniversalActions(C->getDefaultToolChain(), C->getArgs(), Inputs, - C->getActions()); + BuildUniversalActions(*C, C->getDefaultToolChain(), Inputs); else - BuildActions(C->getDefaultToolChain(), C->getArgs(), Inputs, + BuildActions(*C, C->getDefaultToolChain(), C->getArgs(), Inputs, C->getActions()); if (CCCPrintPhases) { @@ -617,9 +620,9 @@ // Darwin OSes this uses the driver-driver and builds universal actions.
const ToolChain &TC = C.getDefaultToolChain(); if (TC.getTriple().isOSBinFormatMachO()) - BuildUniversalActions(TC, C.getArgs(), Inputs, C.getActions()); + BuildUniversalActions(C, TC, Inputs); else - BuildActions(TC, C.getArgs(), Inputs, C.getActions()); + BuildActions(C, TC, C.getArgs(), Inputs, C.getActions()); BuildJobs(C); @@ -948,7 +951,7 @@ os << '"' << CDA->getGpuArchName() << '"' << ", {" << PrintActions1(C, *CDA->begin(), Ids) << "}"; } else { - ActionList *AL; + const ActionList *AL; if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) { os << "{" << PrintActions1(C, *CHA->begin(), Ids) << "}" << ", gpu binaries "; @@ -997,9 +1000,10 @@ return false; } -void Driver::BuildUniversalActions(const ToolChain &TC, DerivedArgList &Args, - const InputList &BAInputs, - ActionList &Actions) const { +void Driver::BuildUniversalActions(Compilation &C, const ToolChain &TC, + const InputList &BAInputs) const { + DerivedArgList &Args = C.getArgs(); + ActionList &Actions = C.getActions(); llvm::PrettyStackTraceString CrashInfo("Building universal build actions"); // Collect the list of architectures. Duplicates are allowed, but should only // be handled once (in the order seen). @@ -1028,7 +1032,7 @@ Archs.push_back(Args.MakeArgString(TC.getDefaultUniversalArchName())); ActionList SingleActions; - BuildActions(TC, Args, BAInputs, SingleActions); + BuildActions(C, TC, Args, BAInputs, SingleActions); // Add in arch bindings for every top level action, as well as lipo and // dsymutil steps if needed. @@ -1279,21 +1283,15 @@ // and returns a new CudaHostAction which wraps \p Current and device // side actions. static std::unique_ptr<Action> -buildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args, - const Arg *InputArg, std::unique_ptr<Action> HostAction, - ActionList &Actions) { - // Figure out which NVPTX triple to use for device-side compilation based on - // whether host is 64-bit. - const char *DeviceTriple = TC.getTriple().isArch64Bit() ? "nvptx64-nvidia-cuda" : "nvptx-nvidia-cuda"; +buildCudaActions(Compilation &C, DerivedArgList &Args, const Arg *InputArg, + std::unique_ptr<Action> HostAction, ActionList &Actions) { Arg *PartialCompilationArg = Args.getLastArg(options::OPT_cuda_host_only, options::OPT_cuda_device_only); // Host-only compilation case. if (PartialCompilationArg && PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only)) return std::unique_ptr<Action>( - new CudaHostAction(std::move(HostAction), {}, DeviceTriple)); + new CudaHostAction(std::move(HostAction), {})); // Collect all cuda_gpu_arch parameters, removing duplicates. SmallVector GpuArchList; @@ -1317,8 +1315,11 @@ CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg)); // Build actions for all device inputs. + assert(C.getCudaDeviceToolChain() && + "Missing toolchain for device-side compilation."); ActionList CudaDeviceActions; - D.BuildActions(TC, Args, CudaDeviceInputs, CudaDeviceActions); + C.getDriver().BuildActions(C, *C.getCudaDeviceToolChain(), Args, + CudaDeviceInputs, CudaDeviceActions); assert(GpuArchList.size() == CudaDeviceActions.size() && "Failed to create actions for all devices"); @@ -1342,14 +1343,15 @@ // -o is ambiguous if we have more than one top-level action.
if (Args.hasArg(options::OPT_o) && (!DeviceOnlyCompilation || GpuArchList.size() > 1)) { - D.Diag(clang::diag::err_drv_output_argument_with_multiple_files); + C.getDriver().Diag( + clang::diag::err_drv_output_argument_with_multiple_files); return nullptr; } for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) Actions.push_back(new CudaDeviceAction( std::unique_ptr<Action>(CudaDeviceActions[I]), GpuArchList[I], - DeviceTriple, /* AtTopLevel */ true)); + /* AtTopLevel */ true)); // Kill host action in case of device-only compilation. if (DeviceOnlyCompilation) HostAction.reset(nullptr); @@ -1362,15 +1364,16 @@ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) DeviceActions.push_back(new CudaDeviceAction( std::unique_ptr<Action>(CudaDeviceActions[I]), GpuArchList[I], - DeviceTriple, /* AtTopLevel */ false)); + /* AtTopLevel */ false)); // Return a new host action that incorporates original host action and all // device actions. return std::unique_ptr<Action>( - new CudaHostAction(std::move(HostAction), DeviceActions, DeviceTriple)); + new CudaHostAction(std::move(HostAction), DeviceActions)); } -void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args, - const InputList &Inputs, ActionList &Actions) const { +void Driver::BuildActions(Compilation &C, const ToolChain &TC, + DerivedArgList &Args, const InputList &Inputs, + ActionList &Actions) const { llvm::PrettyStackTraceString CrashInfo("Building compilation actions"); if (!SuppressMissingInputWarning && Inputs.empty()) { @@ -1500,8 +1503,8 @@ Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current)); if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase) { - Current = buildCudaActions(*this, TC, Args, InputArg, - std::move(Current), Actions); + Current = + buildCudaActions(C, Args, InputArg, std::move(Current), Actions); if (!Current) break; } @@ -1803,7 +1806,7 @@ InputInfo II; // Append outputs of device jobs to the input list. for (const Action *DA : CHA->getDeviceActions()) { - BuildJobsForAction(C, DA, TC, "", AtTopLevel, + BuildJobsForAction(C, DA, TC, nullptr, AtTopLevel, /*MultipleArchs*/ false, LinkingOutput, II); CudaDeviceInputInfos.push_back(II); } @@ -1843,11 +1846,12 @@ } if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) { - BuildJobsForAction( - C, *CDA->begin(), - &getToolChain(C.getArgs(), llvm::Triple(CDA->getDeviceTriple())), - CDA->getGpuArchName(), CDA->isAtTopLevel(), - /*MultipleArchs*/ true, LinkingOutput, Result); + // Initial processing of CudaDeviceAction carries host params. + // Call BuildJobsForAction() again, now with correct device parameters.
+ assert(CDA->getGpuArchName() && "No GPU name in device action."); + BuildJobsForAction(C, *CDA->begin(), C.getCudaDeviceToolChain(), + CDA->getGpuArchName(), CDA->isAtTopLevel(), + /*MultipleArchs*/ true, LinkingOutput, Result); return; } Index: lib/Driver/ToolChain.cpp =================================================================== --- lib/Driver/ToolChain.cpp +++ lib/Driver/ToolChain.cpp @@ -657,3 +657,6 @@ Res |= CFIICall; return Res; } + +void ToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, + ArgStringList &CC1Args) const {} Index: lib/Driver/ToolChains.h =================================================================== --- lib/Driver/ToolChains.h +++ lib/Driver/ToolChains.h @@ -166,6 +166,7 @@ std::string CudaLibPath; std::string CudaLibDevicePath; std::string CudaIncludePath; + llvm::StringMap<std::string> CudaLibDeviceMap; public: CudaInstallationDetector(const Driver &D) : IsValid(false), D(D) {} @@ -185,6 +186,9 @@ /// \brief Get the detected Cuda device library path. StringRef getLibDevicePath() const { return CudaLibDevicePath; } /// \brief Get libdevice file for given architecture + StringRef getLibDeviceFile(StringRef Gpu) const { + return CudaLibDeviceMap.lookup(Gpu); + } }; CudaInstallationDetector CudaInstallation; @@ -784,6 +788,8 @@ void AddClangCXXStdlibIncludeArgs( const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; + void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; bool isPIEDefault() const override; SanitizerMask getSupportedSanitizers() const override; void addProfileRTLibs(const llvm::opt::ArgList &Args, Index: lib/Driver/ToolChains.cpp =================================================================== --- lib/Driver/ToolChains.cpp +++ lib/Driver/ToolChains.cpp @@ -1642,6 +1642,31 @@ D.getVFS().exists(CudaLibDevicePath))) continue; + const StringRef LibDeviceName = "libdevice."; + std::error_code EC; + for (llvm::sys::fs::directory_iterator LI(CudaLibDevicePath, EC), LE; + !EC && LI != LE; LI = LI.increment(EC)) { + StringRef FilePath = LI->path(); + StringRef FileName = llvm::sys::path::filename(FilePath); + // Process all bitcode filenames that look like libdevice.compute_XX.YY.bc + if (!(FileName.startswith(LibDeviceName) && FileName.endswith(".bc"))) + continue; + StringRef GpuArch = FileName.slice( + LibDeviceName.size(), FileName.find('.', LibDeviceName.size())); + CudaLibDeviceMap[GpuArch] = FilePath.str(); + // Insert map entries for specific devices with this compute capability.
+ if (GpuArch == "compute_20") { + CudaLibDeviceMap["sm_20"] = FilePath; + CudaLibDeviceMap["sm_21"] = FilePath; + } else if (GpuArch == "compute_30") { + CudaLibDeviceMap["sm_30"] = FilePath; + CudaLibDeviceMap["sm_32"] = FilePath; + } else if (GpuArch == "compute_35") { + CudaLibDeviceMap["sm_35"] = FilePath; + CudaLibDeviceMap["sm_37"] = FilePath; + } + } + IsValid = true; break; } @@ -3926,6 +3951,18 @@ } } +void Linux::AddCudaIncludeArgs(const ArgList &DriverArgs, + ArgStringList &CC1Args) const { + if (DriverArgs.hasArg(options::OPT_nocudainc)) + return; + + if (CudaInstallation.isValid()) { + addSystemInclude(DriverArgs, CC1Args, CudaInstallation.getIncludePath()); + CC1Args.push_back("-include"); + CC1Args.push_back("cuda_runtime.h"); + } +} + bool Linux::isPIEDefault() const { return getSanitizerArgs().requiresPIE(); } SanitizerMask Linux::getSupportedSanitizers() const { @@ -4008,6 +4045,22 @@ llvm::opt::ArgStringList &CC1Args) const { Linux::addClangTargetOptions(DriverArgs, CC1Args); CC1Args.push_back("-fcuda-is-device"); + + if (DriverArgs.hasArg(options::OPT_nocudalib)) + return; + + std::string LibDeviceFile = CudaInstallation.getLibDeviceFile( + DriverArgs.getLastArgValue(options::OPT_march_EQ)); + if (!LibDeviceFile.empty()) { + CC1Args.push_back("-mlink-cuda-bitcode"); + CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile)); + + // Libdevice in CUDA-7.0 requires PTX version that's more recent + // than LLVM defaults to. Use PTX4.2 which is the PTX version that + // came with CUDA-7.0. + CC1Args.push_back("-target-feature"); + CC1Args.push_back("+ptx42"); + } } llvm::opt::DerivedArgList * Index: lib/Driver/Tools.h =================================================================== --- lib/Driver/Tools.h +++ lib/Driver/Tools.h @@ -57,7 +57,8 @@ const Driver &D, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs, const InputInfo &Output, - const InputInfoList &Inputs) const; + const InputInfoList &Inputs, + const ToolChain *AuxToolChain) const; void AddAArch64TargetArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const; Index: lib/Driver/Tools.cpp =================================================================== --- lib/Driver/Tools.cpp +++ lib/Driver/Tools.cpp @@ -248,7 +248,8 @@ const Driver &D, const ArgList &Args, ArgStringList &CmdArgs, const InputInfo &Output, - const InputInfoList &Inputs) const { + const InputInfoList &Inputs, + const ToolChain *AuxToolChain) const { Arg *A; CheckPreprocessingOptions(D, Args); @@ -441,11 +442,20 @@ addDirectoryList(Args, CmdArgs, "-objcxx-isystem", "OBJCPLUS_INCLUDE_PATH"); // Add C++ include arguments, if needed. - if (types::isCXX(Inputs[0].getType())) + if (types::isCXX(Inputs[0].getType())) { getToolChain().AddClangCXXStdlibIncludeArgs(Args, CmdArgs); + if (AuxToolChain) + AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs); + } // Add system include arguments. getToolChain().AddClangSystemIncludeArgs(Args, CmdArgs); + if (AuxToolChain) + AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs); + + // Add CUDA include arguments, if needed. + if (types::isCuda(Inputs[0].getType())) + getToolChain().AddCudaIncludeArgs(Args, CmdArgs); } // FIXME: Move to target hook. 
@@ -3224,6 +3234,21 @@ CmdArgs.push_back("-triple"); CmdArgs.push_back(Args.MakeArgString(TripleStr)); + const ToolChain *AuxToolChain = nullptr; + if (IsCuda) { + if (&getToolChain() == C.getCudaDeviceToolChain()) + AuxToolChain = C.getCudaHostToolChain(); + else if (&getToolChain() == C.getCudaHostToolChain()) + AuxToolChain = C.getCudaDeviceToolChain(); + else + llvm_unreachable("Can't figure out CUDA compilation mode."); + if (AuxToolChain) { + CmdArgs.push_back("-aux-triple"); + CmdArgs.push_back(Args.MakeArgString(AuxToolChain->getTriple().str())); + } + CmdArgs.push_back("-fcuda-target-overloads"); + } + if (Triple.isOSWindows() && (Triple.getArch() == llvm::Triple::arm || Triple.getArch() == llvm::Triple::thumb)) { unsigned Offset = Triple.getArch() == llvm::Triple::arm ? 4 : 6; @@ -4024,7 +4049,8 @@ // // FIXME: Support -fpreprocessed if (types::getPreprocessedType(InputType) != types::TY_INVALID) - AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs); + AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs, + AuxToolChain); // Don't warn about "clang -c -DPIC -fPIC test.i" because libtool.m4 assumes // that "The compiler can only warn and ignore the option if not recognized". Index: lib/Headers/CMakeLists.txt =================================================================== --- lib/Headers/CMakeLists.txt +++ lib/Headers/CMakeLists.txt @@ -17,6 +17,7 @@ bmiintrin.h cpuid.h cuda_builtin_vars.h + cuda_runtime.h emmintrin.h f16cintrin.h float.h Index: lib/Headers/cuda_runtime.h =================================================================== --- /dev/null +++ lib/Headers/cuda_runtime.h @@ -0,0 +1,155 @@ +/*===---- cuda_runtime.h - CUDA runtime support ----------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_CUDA_RUNTIME_H__ +#define __CLANG_CUDA_RUNTIME_H__ + +#if defined(__PTX__) + +// Include some standard headers to avoid CUDA headers including them +// while some required macros (like __THROW) are in a weird state. +#include + +// Preserve common macros that will be changed below by us or by CUDA +// headers. +#pragma push_macro("__THROW") +#pragma push_macro("__CUDA_ARCH__") + +// WARNING: Preprocessor hacks below are based on specific details of the +// implementation of CUDA-7.x headers and are expected to break with +// any other version of CUDA headers.
+#include "cuda.h" +#if !defined(CUDA_VERSION) +#error "cuda.h did not define CUDA_VERSION" +#elif CUDA_VERSION < 7000 || CUDA_VERSION > 7050 +#error "Unsupported CUDA version!" +#endif + +// Make largest subset of device functions available during host +// compilation -- SM_35 for the time being. +#ifndef __CUDA_ARCH__ +#define __CUDA_ARCH__ 350 +#endif + +#include "cuda_builtin_vars.h" + +// No need for device_launch_parameters.h as cuda_builtin_vars.h above +// has taken care of builtin variables declared in the file. +#define __DEVICE_LAUNCH_PARAMETERS_H__ + +// {math,device}_functions.h only have declarations of the +// functions. We don't need them as we're going to pull in their +// definitions from .hpp files. +#define __DEVICE_FUNCTIONS_H__ +#define __MATH_FUNCTIONS_H__ + +#undef __CUDACC__ +#define __CUDABE__ +#include "host_config.h" +#include "host_defines.h" +#include "driver_types.h" +#include "common_functions.h" + +#undef __CUDABE__ +#define __CUDACC__ +#include_next "cuda_runtime.h" + +#undef __CUDACC__ +#define __CUDABE__ +#include "crt/host_runtime.h" +#include "crt/device_runtime.h" + +// We need decls for functions in CUDA's libdevice woth __device__ +// attribute only. Alas they come either as __host__ __device__ or +// with no attributes at all. To work around that, define __CUDA_RTC__ +// which produces HD variant and undef __host__ which gives us desided +// decls with __device__ attribute. +#pragma push_macro("__host__") +#define __host__ +#define __CUDACC_RTC__ +#include "device_functions_decls.h" +#undef __CUDACC_RTC__ + +// Temporarily poison __host__ macro to ensure it's not used by any of +// the headers we're about to include. +#define __host__ UNEXPECTED_HOST_ATTRIBUTE + +// device_functions.hpp and math_functions*.hpp use 'static +// __forceinline__' (with no __device__) for definitions of device +// functions. Temporarily redefine __forceinline__ to include +// __device__. +#pragma push_macro("__forceinline__") +#define __forceinline__ __device__ __inline__ __attribute__((always_inline)) +#include "device_functions.hpp" +#include "math_functions.hpp" +#include "math_functions_dbl_ptx3.hpp" +#pragma pop_macro("__forceinline__") + +// For some reason single-argument variant is not always declared by +// CUDA headers. Alas, device_functions.hpp included below needs it. +static inline __device__ void __brkpt(int c) { __brkpt(); } + +// Now include *.hpp with definitions of various GPU functions. Alas, +// a lot of thins get declared/defined with __host__ attribute which +// we don't want and we have to define it out. We also have to include +// {device,math}_functions.hpp again in order to extract the other +// branch of #if/else inside. + +#define __host__ +#undef __CUDABE__ +#define __CUDACC__ +#undef __DEVICE_FUNCTIONS_HPP__ +#include "device_functions.hpp" +#include "device_atomic_functions.hpp" +#include "sm_20_atomic_functions.hpp" +#include "sm_32_atomic_functions.hpp" +#include "sm_20_intrinsics.hpp" +// sm_30_intrinsics.h has declarations that use default argument, so +// we have to include it and it will in turn include .hpp +#include "sm_30_intrinsics.h" +#include "sm_32_intrinsics.hpp" +#undef __MATH_FUNCTIONS_HPP__ +#include "math_functions.hpp" +#pragma pop_macro("__host__") + +// Restore state of __CUDA_ARCH__ and __THROW we had on entry. +#pragma pop_macro("__CUDA_ARCH__") +#pragma pop_macro("__THROW") + +// Set up compiler macros expected to be seen during compilation. 
+#undef __CUDABE__ +#define __CUDACC__ +#define __NVCC__ + +#if defined(__CUDA_ARCH__) +// We need to emit an IR declaration for the non-existent __nvvm_reflect to +// let the backend know that it should be treated as a const nothrow +// function, which is implicitly assumed by the NVVMReflect pass. +extern "C" __device__ __attribute__((const)) int __nvvm_reflect(const void *); +static __device__ __attribute__((used)) int __nvvm_reflect_anchor() { + return __nvvm_reflect("NONE"); +} +#endif + +#endif // __PTX__ +#endif // __CLANG_CUDA_RUNTIME_H__ Index: test/Driver/cuda-detect.cu =================================================================== --- test/Driver/cuda-detect.cu +++ test/Driver/cuda-detect.cu @@ -1,10 +1,61 @@ // REQUIRES: clang-driver // REQUIRES: x86-registered-target // +// # Check that we properly detect CUDA installation. // RUN: %clang -v --target=i386-unknown-linux \ -// RUN: --sysroot=/tmp/no-cuda-there 2>&1 | FileCheck %s -check-prefix NOCUDA +// RUN: --sysroot=%S/no-cuda-there 2>&1 | FileCheck %s -check-prefix NOCUDA +// RUN: %clang -v --target=i386-unknown-linux \ +// RUN: --sysroot=%S/Inputs/CUDA 2>&1 | FileCheck %s // RUN: %clang -v --target=i386-unknown-linux \ // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 | FileCheck %s +// Make sure we map libdevice bitcode files to proper GPUs. +// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_21 \ +// RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix LIBDEVICE -check-prefix LIBDEVICE21 +// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ +// RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix LIBDEVICE -check-prefix LIBDEVICE35 \ +// RUN: -check-prefix CUDAINC +// Verify that -nocudainc prevents adding include path to CUDA headers. +// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ +// RUN: -nocudainc --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix LIBDEVICE -check-prefix LIBDEVICE35 \ +// RUN: -check-prefix NOCUDAINC + +// Verify that no options related to bitcode linking are passed if +// there's no bitcode file. +// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \ +// RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix NOLIBDEVICE +// ... or if we explicitly passed -nocudalib +// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ +// RUN: -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix NOLIBDEVICE +// Verify that we don't add include paths, link with libdevice or +// -include cuda_runtime without a valid CUDA installation.
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ +// RUN: --cuda-path=%S/no-cuda-there %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix NOCUDAINC -check-prefix NOLIBDEVICE + // CHECK: Found CUDA installation: {{.*}}/Inputs/CUDA/usr/local/cuda // NOCUDA-NOT: Found CUDA installation: + +// LIBDEVICE: "-triple" "nvptx-nvidia-cuda" +// LIBDEVICE-SAME: "-fcuda-is-device" +// LIBDEVICE-SAME: "-mlink-cuda-bitcode" +// LIBDEVICE21-SAME: libdevice.compute_20.10.bc +// LIBDEVICE35-SAME: libdevice.compute_35.10.bc +// LIBDEVICE-SAME: "-target-feature" "+ptx42" +// CUDAINC-SAME: "-internal-isystem" "{{.*}}/Inputs/CUDA/usr/local/cuda/include" +// CUDAINC-SAME: "-include" "cuda_runtime.h" +// NOCUDAINC-NOT: "-internal-isystem" "{{.*}}/Inputs/CUDA/usr/local/cuda/include" +// NOCUDAINC-NOT: "-include" "cuda_runtime.h" +// LIBDEVICE-SAME: "-x" "cuda" + +// NOLIBDEVICE: "-triple" "nvptx-nvidia-cuda" +// NOLIBDEVICE-SAME: "-fcuda-is-device" +// NOLIBDEVICE-NOT: "-mlink-cuda-bitcode" +// NOLIBDEVICE-NOT: libdevice.compute_{{.*}}.bc +// NOLIBDEVICE-NOT: "-target-feature" +// NOLIBDEVICE-SAME: "-x" "cuda" Index: test/Driver/cuda-options.cu =================================================================== --- test/Driver/cuda-options.cu +++ test/Driver/cuda-options.cu @@ -111,14 +111,6 @@ // Make sure we don't link anything. // RUN: -check-prefix CUDA-NL %s -// Match device-side preprocessor, and compiler phases with -save-temps -// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" -// CUDA-D1S-SAME: "-fcuda-is-device" -// CUDA-D1S-SAME: "-x" "cuda" -// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" -// CUDA-D1S-SAME: "-fcuda-is-device" -// CUDA-D1S-SAME: "-x" "cuda-cpp-output" - // --cuda-host-only should never trigger unused arg warning.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only -c %s 2>&1 | \ // RUN: FileCheck -check-prefix CUDA-NO-UNUSED-CHO %s @@ -133,34 +125,47 @@ // RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only -x c -c %s 2>&1 | \ // RUN: FileCheck -check-prefix CUDA-UNUSED-CDO %s +// Match device-side preprocessor, and compiler phases with -save-temps +// CUDA-D1S: "-cc1" "-triple" "nvptx64-nvidia-cuda" +// CUDA-D1S-SAME: "-aux-triple" "x86_64--linux-gnu" +// CUDA-D1S-SAME: "-fcuda-is-device" +// CUDA-D1S-SAME: "-x" "cuda" + +// CUDA-D1S: "-cc1" "-triple" "nvptx64-nvidia-cuda" +// CUDA-D1S-SAME: "-aux-triple" "x86_64--linux-gnu" +// CUDA-D1S-SAME: "-fcuda-is-device" +// CUDA-D1S-SAME: "-x" "cuda-cpp-output" + // Match the job that produces PTX assembly -// CUDA-D1: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" +// CUDA-D1: "-cc1" "-triple" "nvptx64-nvidia-cuda" +// CUDA-D1NS-SAME: "-aux-triple" "x86_64--linux-gnu" // CUDA-D1-SAME: "-fcuda-is-device" // CUDA-D1-SM35-SAME: "-target-cpu" "sm_35" // CUDA-D1-SAME: "-o" "[[GPUBINARY1:[^"]*]]" // CUDA-D1NS-SAME: "-x" "cuda" // CUDA-D1S-SAME: "-x" "ir" -// Match anothe device-side compilation -// CUDA-D2: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" +// Match another device-side compilation +// CUDA-D2: "-cc1" "-triple" "nvptx64-nvidia-cuda" +// CUDA-D2-SAME: "-aux-triple" "x86_64--linux-gnu" // CUDA-D2-SAME: "-fcuda-is-device" // CUDA-D2-SM30-SAME: "-target-cpu" "sm_30" // CUDA-D2-SAME: "-o" "[[GPUBINARY2:[^"]*]]" // CUDA-D2-SAME: "-x" "cuda" // Match no device-side compilation -// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" +// CUDA-ND-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda" // CUDA-ND-SAME-NOT: "-fcuda-is-device" // Match host-side preprocessor job with -save-temps -// CUDA-HS: "-cc1" "-triple" -// CUDA-HS-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda" +// CUDA-HS: "-cc1" "-triple" "x86_64--linux-gnu" +// CUDA-HS-SAME: "-aux-triple" "nvptx64-nvidia-cuda" // CUDA-HS-SAME-NOT: "-fcuda-is-device" // CUDA-HS-SAME: "-x" "cuda" // Match host-side compilation -// CUDA-H: "-cc1" "-triple" -// CUDA-H-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda" +// CUDA-H: "-cc1" "-triple" "x86_64--linux-gnu" +// CUDA-H-SAME: "-aux-triple" "nvptx64-nvidia-cuda" // CUDA-H-SAME-NOT: "-fcuda-is-device" // CUDA-H-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]" // CUDA-HNS-SAME: "-x" "cuda" Index: test/SemaCUDA/function-target-hd.cu =================================================================== --- test/SemaCUDA/function-target-hd.cu +++ test/SemaCUDA/function-target-hd.cu @@ -8,9 +8,9 @@ // host device functions are not allowed to call device functions. 
// RUN: %clang_cc1 -fsyntax-only -verify %s -// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -verify %s +// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -triple nvptx-unknown-cuda -verify %s // RUN: %clang_cc1 -fsyntax-only -fcuda-allow-host-calls-from-host-device -verify %s -DTEST_WARN_HD -// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -fcuda-allow-host-calls-from-host-device -verify %s -DTEST_WARN_HD +// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -triple nvptx-unknown-cuda -fcuda-allow-host-calls-from-host-device -verify %s -DTEST_WARN_HD #include "Inputs/cuda.h" Index: unittests/ASTMatchers/ASTMatchersTest.h =================================================================== --- unittests/ASTMatchers/ASTMatchersTest.h +++ unittests/ASTMatchers/ASTMatchersTest.h @@ -178,6 +178,7 @@ Args.push_back("-xcuda"); Args.push_back("-fno-ms-extensions"); Args.push_back("--cuda-host-only"); + Args.push_back("-nocudainc"); Args.push_back(CompileArg); if (!runToolOnCodeWithArgs(Factory->create(), CudaHeader + Code, Args)) {
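
Reviewer note (illustrative only, not part of the patch): the libdevice scan in ToolChains.cpp keys CudaLibDeviceMap both by the virtual architecture embedded in the bitcode filename (e.g. "compute_35" from libdevice.compute_35.10.bc) and by the concrete sm_XX architectures it covers, so getLibDeviceFile() can later be queried directly with the -march / --cuda-gpu-arch value. Below is a minimal standalone sketch of that lookup behavior in plain C++; names such as LibDeviceMap and Prefix are hypothetical stand-ins, and std::map replaces llvm::StringMap purely to keep the example self-contained.

#include <iostream>
#include <map>
#include <string>

int main() {
  // Stand-in for CudaLibDeviceMap (llvm::StringMap<std::string> in the patch).
  std::map<std::string, std::string> LibDeviceMap;
  const std::string FileName = "libdevice.compute_35.10.bc";
  const std::string Prefix = "libdevice.";
  // Extract "compute_35": the text between the prefix and the next '.',
  // mirroring the StringRef::slice() call in CudaInstallationDetector.
  const std::string GpuArch = FileName.substr(
      Prefix.size(), FileName.find('.', Prefix.size()) - Prefix.size());
  LibDeviceMap[GpuArch] = FileName;
  // The driver also maps the concrete SM versions covered by this compute
  // capability to the same file (compute_20/30/35 cases in the patch).
  if (GpuArch == "compute_35") {
    LibDeviceMap["sm_35"] = FileName;
    LibDeviceMap["sm_37"] = FileName;
  }
  // --cuda-gpu-arch=sm_35 therefore resolves to libdevice.compute_35.10.bc,
  // while an arch with no matching bitcode yields an empty string, which is
  // why the driver then omits -mlink-cuda-bitcode (see cuda-detect.cu).
  std::cout << LibDeviceMap["sm_35"] << "\n";
  std::cout << "[" << LibDeviceMap["sm_30"] << "]\n";
  return 0;
}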