Index: include/clang/Driver/ToolChain.h =================================================================== --- include/clang/Driver/ToolChain.h +++ include/clang/Driver/ToolChain.h @@ -411,7 +411,8 @@ /// \brief Add options that need to be passed to cc1 for this target. virtual void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const; /// \brief Add warning options that need to be passed to cc1 for this target. virtual void addClangWarningOptions(llvm::opt::ArgStringList &CC1Args) const; Index: lib/Driver/ToolChain.cpp =================================================================== --- lib/Driver/ToolChain.cpp +++ lib/Driver/ToolChain.cpp @@ -544,9 +544,9 @@ // Each toolchain should provide the appropriate include flags. } -void ToolChain::addClangTargetOptions(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { -} +void ToolChain::addClangTargetOptions( + const ArgList &DriverArgs, ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const {} void ToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {} Index: lib/Driver/ToolChains/BareMetal.h =================================================================== --- lib/Driver/ToolChains/BareMetal.h +++ lib/Driver/ToolChains/BareMetal.h @@ -54,7 +54,8 @@ void AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; std::string findLibCxxIncludePath(ToolChain::CXXStdlibType LibType) const; void AddClangCXXStdlibIncludeArgs( const llvm::opt::ArgList &DriverArgs, Index: lib/Driver/ToolChains/BareMetal.cpp =================================================================== --- lib/Driver/ToolChains/BareMetal.cpp +++ lib/Driver/ToolChains/BareMetal.cpp @@ -98,7 +98,8 @@ } void BareMetal::addClangTargetOptions(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { + ArgStringList &CC1Args, + Action::OffloadKind) const { CC1Args.push_back("-nostdsysteminc"); } Index: lib/Driver/ToolChains/Clang.cpp =================================================================== --- lib/Driver/ToolChains/Clang.cpp +++ lib/Driver/ToolChains/Clang.cpp @@ -2574,7 +2574,8 @@ AsynchronousUnwindTables)) CmdArgs.push_back("-munwind-tables"); - getToolChain().addClangTargetOptions(Args, CmdArgs); + getToolChain().addClangTargetOptions(Args, CmdArgs, + JA.getOffloadingDeviceKind()); if (Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) { CmdArgs.push_back("-mlimit-float-precision"); Index: lib/Driver/ToolChains/Cuda.h =================================================================== --- lib/Driver/ToolChains/Cuda.h +++ lib/Driver/ToolChains/Cuda.h @@ -130,7 +130,8 @@ TranslateArgs(const llvm::opt::DerivedArgList &Args, StringRef BoundArch, Action::OffloadKind DeviceOffloadKind) const override; void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; // Never try to use the integrated assembler with CUDA; always fork out to // ptxas. Index: lib/Driver/ToolChains/Cuda.cpp =================================================================== --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -212,8 +212,21 @@ static_cast(getToolChain()); assert(TC.getTriple().isNVPTX() && "Wrong platform"); + StringRef GPUArchName; + std::vector GPUArchNames; + // If this is an OpenMP action we need to extract the device architecture from + // the -march option. + if (JA.isDeviceOffloading(Action::OFK_OpenMP)) { + GPUArchNames = Args.getAllArgValues(options::OPT_march_EQ); + assert(GPUArchNames.size() == 1 && + "Exactly one GPU Arch required for ptxas."); + // TODO: get compute capability if a flag is used to pass it to the driver. + GPUArchName = GPUArchNames[0]; + } else + GPUArchName = JA.getOffloadingArch(); + // Obtain architecture from the action. - CudaArch gpu_arch = StringToCudaArch(JA.getOffloadingArch()); + CudaArch gpu_arch = StringToCudaArch(GPUArchName); assert(gpu_arch != CudaArch::UNKNOWN && "Device action expected to have an architecture."); @@ -346,26 +359,32 @@ void CudaToolChain::addClangTargetOptions( const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const { - HostTC.addClangTargetOptions(DriverArgs, CC1Args); + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadingKind) const { + HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind); - CC1Args.push_back("-fcuda-is-device"); + StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ); + assert(!GpuArch.empty() && "Must have an explicit GPU arch."); + assert((DeviceOffloadingKind == Action::OFK_OpenMP || + DeviceOffloadingKind == Action::OFK_Cuda) && + "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs."); - if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero, - options::OPT_fno_cuda_flush_denormals_to_zero, false)) - CC1Args.push_back("-fcuda-flush-denormals-to-zero"); + if (DeviceOffloadingKind == Action::OFK_Cuda) { + CC1Args.push_back("-fcuda-is-device"); - if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals, - options::OPT_fno_cuda_approx_transcendentals, false)) - CC1Args.push_back("-fcuda-approx-transcendentals"); + if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero, + options::OPT_fno_cuda_flush_denormals_to_zero, false)) + CC1Args.push_back("-fcuda-flush-denormals-to-zero"); - if (DriverArgs.hasArg(options::OPT_nocudalib)) - return; + if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals, + options::OPT_fno_cuda_approx_transcendentals, false)) + CC1Args.push_back("-fcuda-approx-transcendentals"); - StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ); - assert(!GpuArch.empty() && "Must have an explicit GPU arch."); - std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch); + if (DriverArgs.hasArg(options::OPT_nocudalib)) + return; + } + std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch); if (LibDeviceFile.empty()) { getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch; return; @@ -404,6 +423,29 @@ const OptTable &Opts = getDriver().getOpts(); + // For OpenMP device offloading, append derived arguments along with + // the compute capability. Make sure flags are not duplicated. + if (DeviceOffloadKind == Action::OFK_OpenMP) { + for (Arg *A : Args){ + bool IsDuplicate = false; + for (Arg *DALArg : *DAL){ + if (A == DALArg) { + IsDuplicate = true; + break; + } + } + if (!IsDuplicate) + DAL->append(A); + } + + // TODO: get the compute capability from offloading arguments when not + // using the default compute capability of sm_20. + if (Args.getAllArgValues(options::OPT_march_EQ).empty()) + DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), + "sm_20"); + return DAL; + } + for (Arg *A : Args) { if (A->getOption().matches(options::OPT_Xarch__)) { // Skip this argument unless the architecture matches BoundArch Index: lib/Driver/ToolChains/Fuchsia.h =================================================================== --- lib/Driver/ToolChains/Fuchsia.h +++ lib/Driver/ToolChains/Fuchsia.h @@ -55,7 +55,8 @@ GetCXXStdlibType(const llvm::opt::ArgList &Args) const override; void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; void AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; Index: lib/Driver/ToolChains/Fuchsia.cpp =================================================================== --- lib/Driver/ToolChains/Fuchsia.cpp +++ lib/Driver/ToolChains/Fuchsia.cpp @@ -172,7 +172,8 @@ } void Fuchsia::addClangTargetOptions(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { + ArgStringList &CC1Args, + Action::OffloadKind) const { if (DriverArgs.hasFlag(options::OPT_fuse_init_array, options::OPT_fno_use_init_array, true)) CC1Args.push_back("-fuse-init-array"); Index: lib/Driver/ToolChains/Gnu.h =================================================================== --- lib/Driver/ToolChains/Gnu.h +++ lib/Driver/ToolChains/Gnu.h @@ -341,7 +341,8 @@ : Generic_GCC(D, Triple, Args) {} void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; }; } // end namespace toolchains Index: lib/Driver/ToolChains/Gnu.cpp =================================================================== --- lib/Driver/ToolChains/Gnu.cpp +++ lib/Driver/ToolChains/Gnu.cpp @@ -2461,7 +2461,8 @@ void Generic_ELF::anchor() {} void Generic_ELF::addClangTargetOptions(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { + ArgStringList &CC1Args, + Action::OffloadKind) const { const Generic_GCC::GCCVersion &V = GCCInstallation.getVersion(); bool UseInitArrayDefault = getTriple().getArch() == llvm::Triple::aarch64 || Index: lib/Driver/ToolChains/Hexagon.h =================================================================== --- lib/Driver/ToolChains/Hexagon.h +++ lib/Driver/ToolChains/Hexagon.h @@ -69,7 +69,8 @@ ~HexagonToolChain() override; void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; void AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; Index: lib/Driver/ToolChains/Hexagon.cpp =================================================================== --- lib/Driver/ToolChains/Hexagon.cpp +++ lib/Driver/ToolChains/Hexagon.cpp @@ -428,7 +428,8 @@ } void HexagonToolChain::addClangTargetOptions(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { + ArgStringList &CC1Args, + Action::OffloadKind) const { if (DriverArgs.hasArg(options::OPT_ffp_contract)) return; unsigned OptLevel = getOptimizationLevel(DriverArgs); Index: lib/Driver/ToolChains/WebAssembly.h =================================================================== --- lib/Driver/ToolChains/WebAssembly.h +++ lib/Driver/ToolChains/WebAssembly.h @@ -53,7 +53,8 @@ bool SupportsProfiling() const override; bool HasNativeLLVMSupport() const override; void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; RuntimeLibType GetDefaultRuntimeLibType() const override; CXXStdlibType GetCXXStdlibType(const llvm::opt::ArgList &Args) const override; void AddClangSystemIncludeArgs( Index: lib/Driver/ToolChains/WebAssembly.cpp =================================================================== --- lib/Driver/ToolChains/WebAssembly.cpp +++ lib/Driver/ToolChains/WebAssembly.cpp @@ -134,7 +134,8 @@ bool WebAssembly::HasNativeLLVMSupport() const { return true; } void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { + ArgStringList &CC1Args, + Action::OffloadKind) const { if (DriverArgs.hasFlag(clang::driver::options::OPT_fuse_init_array, options::OPT_fno_use_init_array, true)) CC1Args.push_back("-fuse-init-array"); Index: lib/Driver/ToolChains/XCore.h =================================================================== --- lib/Driver/ToolChains/XCore.h +++ lib/Driver/ToolChains/XCore.h @@ -67,7 +67,8 @@ AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; void AddClangCXXStdlibIncludeArgs( const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; Index: lib/Driver/ToolChains/XCore.cpp =================================================================== --- lib/Driver/ToolChains/XCore.cpp +++ lib/Driver/ToolChains/XCore.cpp @@ -124,7 +124,8 @@ } void XCoreToolChain::addClangTargetOptions(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { + ArgStringList &CC1Args, + Action::OffloadKind) const { CC1Args.push_back("-nostdsysteminc"); } Index: test/Driver/openmp-offload.c =================================================================== --- test/Driver/openmp-offload.c +++ test/Driver/openmp-offload.c @@ -607,3 +607,12 @@ // CHK-FOPENMP-IS-DEVICE: clang{{.*}}.i" {{.*}}" "-fopenmp-is-device" // CHK-FOPENMP-IS-DEVICE-NEXT: clang{{.*}}.bc" {{.*}}.i" "-fopenmp-is-device" "-fopenmp-host-ir-file-path" // CHK-FOPENMP-IS-DEVICE-NEXT: clang{{.*}}.s" {{.*}}.bc" "-fopenmp-is-device" + +/// ########################################################################### + +/// Check -march propagates compute capability to device offloading toolchain. +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -save-temps -no-canonical-prefixes -march=sm_35 %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-COMPUTE-CAPABILITY %s + +// CHK-COMPUTE-CAPABILITY: ptxas{{.*}}" "--gpu-name" "sm_35" +// CHK-COMPUTE-CAPABILITY-NEXT: nvlink{{.*}}" "-arch" "sm_35"