Index: include/clang/Basic/LangOptions.def
===================================================================
--- include/clang/Basic/LangOptions.def
+++ include/clang/Basic/LangOptions.def
@@ -204,6 +204,7 @@
 LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
 LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero")
 LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
+LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generate relocatable device code")
 
 LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
 LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
Index: include/clang/Driver/Options.td
===================================================================
--- include/clang/Driver/Options.td
+++ include/clang/Driver/Options.td
@@ -566,6 +566,9 @@
 def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
   Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
 def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">;
+def fcuda_rdc : Flag<["-"], "fcuda-rdc">, Flags<[CC1Option, HelpHidden]>,
+  HelpText<"Generate relocatable device code, also known as separate compilation mode.">;
+def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">;
 def dA : Flag<["-"], "dA">, Group<d_Group>;
 def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
   HelpText<"Print macro definitions in -E mode in addition to normal output">;
Index: lib/Driver/ToolChains/Clang.cpp
===================================================================
--- lib/Driver/ToolChains/Clang.cpp
+++ lib/Driver/ToolChains/Clang.cpp
@@ -4658,14 +4658,20 @@
     CmdArgs.push_back(Args.MakeArgString(Flags));
   }
 
-  // Host-side cuda compilation receives device-side outputs as Inputs[1...].
-  // Include them with -fcuda-include-gpubinary.
-  if (IsCuda && Inputs.size() > 1)
-    for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) {
-      CmdArgs.push_back("-fcuda-include-gpubinary");
-      CmdArgs.push_back(I->getFilename());
+  if (IsCuda) {
+    // Host-side cuda compilation receives device-side outputs as Inputs[1...].
+    // Include them with -fcuda-include-gpubinary.
+    if (Inputs.size() > 1) {
+      for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) {
+        CmdArgs.push_back("-fcuda-include-gpubinary");
+        CmdArgs.push_back(I->getFilename());
+      }
     }
 
+    if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))
+      CmdArgs.push_back("-fcuda-rdc");
+  }
+
   // OpenMP offloading device jobs take the argument -fopenmp-host-ir-file-path
   // to specify the result of the compile phase on the host, so the meaningful
   // device declarations can be identified. Also, -fopenmp-is-device is passed
Index: lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -355,11 +355,17 @@
   for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
     CmdArgs.push_back(Args.MakeArgString(A));
 
-  // In OpenMP we need to generate relocatable code.
-  if (JA.isOffloading(Action::OFK_OpenMP) &&
-      Args.hasFlag(options::OPT_fopenmp_relocatable_target,
-                   options::OPT_fnoopenmp_relocatable_target,
-                   /*Default=*/ true))
+  bool Relocatable = false;
+  if (JA.isOffloading(Action::OFK_OpenMP))
+    // In OpenMP we need to generate relocatable code.
+    Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target,
+                               options::OPT_fnoopenmp_relocatable_target,
+                               /*Default=*/true);
+  else if (JA.isOffloading(Action::OFK_Cuda))
+    Relocatable = Args.hasFlag(options::OPT_fcuda_rdc,
+                               options::OPT_fno_cuda_rdc, /*Default=*/false);
+
+  if (Relocatable)
     CmdArgs.push_back("-c");
 
   const char *Exec;
@@ -540,6 +546,10 @@
     if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
                            options::OPT_fno_cuda_approx_transcendentals, false))
       CC1Args.push_back("-fcuda-approx-transcendentals");
+
+    if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+                           false))
+      CC1Args.push_back("-fcuda-rdc");
   }
 
   if (DriverArgs.hasArg(options::OPT_nocudalib))
Index: lib/Frontend/CompilerInvocation.cpp
===================================================================
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -2074,6 +2074,8 @@
   if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
     Opts.CUDADeviceApproxTranscendentals = 1;
 
+  Opts.CUDARelocatableDeviceCode = Args.hasArg(OPT_fcuda_rdc);
+
   if (Opts.ObjC1) {
     if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
       StringRef value = arg->getValue();
Index: test/Driver/cuda-external-tools.cu
===================================================================
--- test/Driver/cuda-external-tools.cu
+++ test/Driver/cuda-external-tools.cu
@@ -18,6 +18,9 @@
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s
 // RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s
+// Generating relocatable device code
+// RUN: %clang -### -target x86_64-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
 
 // With debugging enabled, ptxas should be run with with no ptxas optimizations.
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-noopt-device-debug -O2 -c %s 2>&1 \
@@ -42,14 +45,23 @@
 // Regular compile targeting sm_35.
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s
+// Separate compilation targeting sm_35.
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
 
 // 32-bit compile.
 // RUN: %clang -### -target i386-linux-gnu -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
+// 32-bit compile when generating relocatable device code.
+// RUN: %clang -### -target i386-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s
 
 // Compile with -fintegrated-as.  This should still cause us to invoke ptxas.
 // RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT0 %s
+// Check that we still pass -c when generating relocatable device code.
+// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
 
 // Check -Xcuda-ptxas and -Xcuda-fatbinary
 // RUN: %clang -### -target x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \
@@ -64,6 +76,14 @@
 // RUN: %clang -### -target i386-apple-macosx -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
 
+// Check relocatable device code generation on MacOS.
+// RUN: %clang -### -target x86_64-apple-macosx -O0 -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
+// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
+// RUN: %clang -### -target i386-apple-macosx -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s
+
 // Check that CLANG forwards the -v flag to PTXAS.
 // RUN:   %clang -### -save-temps -no-canonical-prefixes -v %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-PTXAS-VERBOSE %s
@@ -76,6 +96,8 @@
 // SM35-SAME: "-target-cpu" "sm_35"
 // SM20-SAME: "-o" "[[PTXFILE:[^"]*]]"
 // SM35-SAME: "-o" "[[PTXFILE:[^"]*]]"
+// RDC-SAME: "-fcuda-rdc"
+// CHECK-NOT: "-fcuda-rdc"
 
 // Match the call to ptxas (which assembles PTX to SASS).
 // CHECK: ptxas
@@ -97,6 +119,8 @@
 // CHECK-SAME: "[[PTXFILE]]"
 // PTXAS-EXTRA-SAME: "-foo1"
 // PTXAS-EXTRA-SAME: "-foo2"
+// RDC-SAME: "-c"
+// CHECK-NOT: "-c"
 
 // Match the call to fatbinary (which combines all our PTX and SASS into one
 // blob).
@@ -117,5 +141,7 @@
 // ARCH64-SAME: "-triple" "x86_64-
 // ARCH32-SAME: "-triple" "i386-
 // CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
+// RDC-SAME: "-fcuda-rdc"
+// CHECK-NOT: "-fcuda-rdc"
 
 // CHK-PTXAS-VERBOSE: ptxas{{.*}}" "-v"