Index: include/clang/Basic/LangOptions.def =================================================================== --- include/clang/Basic/LangOptions.def +++ include/clang/Basic/LangOptions.def @@ -204,6 +204,7 @@ LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__") LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero") LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions") +LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generating relocatable device code") LANGOPT(SizedDeallocation , 1, 0, "sized deallocation") LANGOPT(AlignedAllocation , 1, 0, "aligned allocation") Index: include/clang/Driver/Options.td =================================================================== --- include/clang/Driver/Options.td +++ include/clang/Driver/Options.td @@ -566,6 +566,9 @@ def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">, Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">; def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">; +def fcuda_rdc : Flag<["-"], "fcuda-rdc">, Flags<[CC1Option, HelpHidden]>, + HelpText<"Generate relocatable device code, also known as separate compilation mode">; +def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">; def dA : Flag<["-"], "dA">, Group<d_Group>; def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>, HelpText<"Print macro definitions in -E mode in addition to normal output">; Index: lib/Driver/ToolChains/Clang.cpp =================================================================== --- lib/Driver/ToolChains/Clang.cpp +++ lib/Driver/ToolChains/Clang.cpp @@ -4658,14 +4658,20 @@ CmdArgs.push_back(Args.MakeArgString(Flags)); } - // Host-side cuda compilation receives device-side outputs as Inputs[1...]. - // Include them with -fcuda-include-gpubinary. 
- if (IsCuda && Inputs.size() > 1) - for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) { - CmdArgs.push_back("-fcuda-include-gpubinary"); - CmdArgs.push_back(I->getFilename()); + if (IsCuda) { + // Host-side cuda compilation receives device-side outputs as Inputs[1...]. + // Include them with -fcuda-include-gpubinary. + if (Inputs.size() > 1) { + for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) { + CmdArgs.push_back("-fcuda-include-gpubinary"); + CmdArgs.push_back(I->getFilename()); + } } + if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false)) + CmdArgs.push_back("-fcuda-rdc"); + } + // OpenMP offloading device jobs take the argument -fopenmp-host-ir-file-path // to specify the result of the compile phase on the host, so the meaningful // device declarations can be identified. Also, -fopenmp-is-device is passed Index: lib/Driver/ToolChains/Cuda.cpp =================================================================== --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -355,11 +355,17 @@ for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas)) CmdArgs.push_back(Args.MakeArgString(A)); - // In OpenMP we need to generate relocatable code. - if (JA.isOffloading(Action::OFK_OpenMP) && - Args.hasFlag(options::OPT_fopenmp_relocatable_target, - options::OPT_fnoopenmp_relocatable_target, - /*Default=*/ true)) + bool Relocatable = false; + if (JA.isOffloading(Action::OFK_OpenMP)) + // In OpenMP we need to generate relocatable code. 
+ Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target, + options::OPT_fnoopenmp_relocatable_target, + /*Default=*/true); + else if (JA.isOffloading(Action::OFK_Cuda)) + Relocatable = Args.hasFlag(options::OPT_fcuda_rdc, + options::OPT_fno_cuda_rdc, /*Default=*/false); + + if (Relocatable) CmdArgs.push_back("-c"); const char *Exec; @@ -540,6 +546,10 @@ if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals, options::OPT_fno_cuda_approx_transcendentals, false)) CC1Args.push_back("-fcuda-approx-transcendentals"); + + if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, + false)) + CC1Args.push_back("-fcuda-rdc"); } if (DriverArgs.hasArg(options::OPT_nocudalib)) Index: lib/Frontend/CompilerInvocation.cpp =================================================================== --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -2074,6 +2074,8 @@ if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals)) Opts.CUDADeviceApproxTranscendentals = 1; + Opts.CUDARelocatableDeviceCode = Args.hasArg(OPT_fcuda_rdc); + if (Opts.ObjC1) { if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) { StringRef value = arg->getValue(); Index: test/Driver/cuda-external-tools.cu =================================================================== --- test/Driver/cuda-external-tools.cu +++ test/Driver/cuda-external-tools.cu @@ -18,6 +18,9 @@ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s // RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s +// Generating relocatable device code +// RUN: %clang -### -target x86_64-linux-gnu -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s // With debugging enabled, ptxas should be run with with no ptxas optimizations. 
// RUN: %clang -### -target x86_64-linux-gnu --cuda-noopt-device-debug -O2 -c %s 2>&1 \ @@ -42,14 +45,23 @@ // Regular compile targeting sm_35. // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s +// Separate compilation targeting sm_35. +// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s // 32-bit compile. // RUN: %clang -### -target i386-linux-gnu -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s +// 32-bit compile when generating relocatable device code. +// RUN: %clang -### -target i386-linux-gnu -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s // Compile with -fintegrated-as. This should still cause us to invoke ptxas. // RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT0 %s +// Check that we still pass -c when generating relocatable device code. +// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s // Check -Xcuda-ptxas and -Xcuda-fatbinary // RUN: %clang -### -target x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \ @@ -64,6 +76,14 @@ // RUN: %clang -### -target i386-apple-macosx -c %s 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s +// Check relocatable device code generation on MacOS. 
+// RUN: %clang -### -target x86_64-apple-macosx -O0 -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s +// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s +// RUN: %clang -### -target i386-apple-macosx -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s + // Check that CLANG forwards the -v flag to PTXAS. // RUN: %clang -### -save-temps -no-canonical-prefixes -v %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-PTXAS-VERBOSE %s @@ -76,6 +96,8 @@ // SM35-SAME: "-target-cpu" "sm_35" // SM20-SAME: "-o" "[[PTXFILE:[^"]*]]" // SM35-SAME: "-o" "[[PTXFILE:[^"]*]]" +// RDC-SAME: "-fcuda-rdc" +// CHECK-NOT: "-fcuda-rdc" // Match the call to ptxas (which assembles PTX to SASS). // CHECK: ptxas @@ -97,6 +119,8 @@ // CHECK-SAME: "[[PTXFILE]]" // PTXAS-EXTRA-SAME: "-foo1" // PTXAS-EXTRA-SAME: "-foo2" +// RDC-SAME: "-c" +// CHECK-NOT: "-c" // Match the call to fatbinary (which combines all our PTX and SASS into one // blob). @@ -117,5 +141,7 @@ // ARCH64-SAME: "-triple" "x86_64- // ARCH32-SAME: "-triple" "i386- // CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]" +// RDC-SAME: "-fcuda-rdc" +// CHECK-NOT: "-fcuda-rdc" // CHK-PTXAS-VERBOSE: ptxas{{.*}}" "-v"