diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -266,6 +266,7 @@
 LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for HIP")
 LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
 LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP")
+LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.")
 
 LANGOPT(SYCLIsDevice      , 1, 0, "Generate code for SYCL device")
 LANGOPT(SYCLIsHost        , 1, 0, "SYCL host compilation")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2526,9 +2526,9 @@
   PosFlag<SetTrue, [CC1Option]>, NegFlag<SetFalse>, BothFlags<[NoArgumentUnused, HelpHidden]>>;
 def static_openmp: Flag<["-"], "static-openmp">,
   HelpText<"Use the static host OpenMP runtime while linking.">;
-def offload_new_driver : Flag<["--"], "offload-new-driver">, Flags<[CC1Option]>, Group<Action_Group>,
-  HelpText<"Use the new driver for offloading compilation.">;
-def no_offload_new_driver : Flag<["--"], "no-offload-new-driver">, Flags<[CC1Option]>, Group<Action_Group>,
+def offload_new_driver : Flag<["--"], "offload-new-driver">, Flags<[CC1Option]>, Group<f_Group>,
+  MarshallingInfoFlag<LangOpts<"OffloadingNewDriver">>, HelpText<"Use the new driver for offloading compilation.">;
+def no_offload_new_driver : Flag<["--"], "no-offload-new-driver">, Flags<[CC1Option]>, Group<f_Group>,
   HelpText<"Don't Use the new driver for offloading compilation.">;
 def offload_device_only : Flag<["--"], "offload-device-only">,
   HelpText<"Only compile for the offloading device.">;
@@ -2543,7 +2543,7 @@
 def cuda_compile_host_device : Flag<["--"], "cuda-compile-host-device">, Alias<offload_host_device>,
   HelpText<"Compile CUDA code for both host and device (default). Has no "
            "effect on non-CUDA compilations.">;
-def fopenmp_new_driver : Flag<["-"], "fopenmp-new-driver">, Flags<[CC1Option]>, Group<Action_Group>,
+def fopenmp_new_driver : Flag<["-"], "fopenmp-new-driver">, Flags<[CC1Option]>, Group<f_Group>,
   HelpText<"Use the new driver for OpenMP offloading.">;
 def fno_openmp_new_driver : Flag<["-"], "fno-openmp-new-driver">, Flags<[CC1Option]>, Group<Action_Group>,
   Alias<no_offload_new_driver>, HelpText<"Don't use the new driver for OpenMP offloading.">;
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -157,6 +157,8 @@
   llvm::Function *makeModuleDtorFunction();
   /// Transform managed variables for device compilation.
   void transformManagedVars();
+  /// Create offloading entries to register globals in RDC mode.
+  void createOffloadingEntries();
 
 public:
   CGNVCUDARuntime(CodeGenModule &CGM);
@@ -210,7 +212,8 @@
 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
     : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
       TheModule(CGM.getModule()),
-      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
+      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode ||
+                            CGM.getLangOpts().OffloadingNewDriver),
       DeviceMC(InitDeviceMC(CGM)) {
   CodeGen::CodeGenTypes &Types = CGM.getTypes();
   ASTContext &Ctx = CGM.getContext();
@@ -1107,6 +1110,40 @@
   }
 }
 
+// Creates offloading entries for all the kernels and globals that must be
+// registered. The linker will provide a pointer to this section so we can
+// register the symbols with the linked device image.
+void CGNVCUDARuntime::createOffloadingEntries() {
+  llvm::OpenMPIRBuilder OMPBuilder(CGM.getModule());
+  OMPBuilder.initialize();
+
+  StringRef Section = "cuda_offloading_entries";
+  for (KernelInfo &I : EmittedKernels)
+    OMPBuilder.emitOffloadingEntry(KernelHandles[I.Kernel],
+                                   getDeviceSideName(cast<NamedDecl>(I.D)), 0,
+                                   DeviceVarFlags::OffloadGlobalEntry, Section);
+
+  for (VarInfo &I : DeviceVars) {
+    uint64_t VarSize =
+        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
+    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
+      OMPBuilder.emitOffloadingEntry(
+          I.Var, getDeviceSideName(I.D), VarSize,
+          I.Flags.isManaged() ? DeviceVarFlags::OffloadGlobalManagedEntry
+                              : DeviceVarFlags::OffloadGlobalEntry,
+          Section);
+    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
+      OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize,
+                                     DeviceVarFlags::OffloadGlobalSurfaceEntry,
+                                     Section);
+    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
+      OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize,
+                                     DeviceVarFlags::OffloadGlobalTextureEntry,
+                                     Section);
+    }
+  }
+}
+
 // Returns module constructor to be added.
 llvm::Function *CGNVCUDARuntime::finalizeModule() {
   if (CGM.getLangOpts().CUDAIsDevice) {
@@ -1135,7 +1172,11 @@
     }
     return nullptr;
   }
-  return makeModuleCtorFunction();
+  if (!(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
+    return makeModuleCtorFunction();
+
+  createOffloadingEntries();
+  return nullptr;
 }
 
 llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h
--- a/clang/lib/CodeGen/CGCUDARuntime.h
+++ b/clang/lib/CodeGen/CGCUDARuntime.h
@@ -52,6 +52,19 @@
       Texture,  // Builtin texture
     };
 
+    /// The kind flag for an offloading entry.
+    enum OffloadEntryKindFlag : uint32_t {
+      /// Mark the entry as a global entry. This indicates the presense of a
+      /// kernel if the size size field is zero and a variable otherwise.
+      OffloadGlobalEntry = 0x0,
+      /// Mark the entry as a managed global variable.
+      OffloadGlobalManagedEntry = 0x1,
+      /// Mark the entry as a surface variable.
+      OffloadGlobalSurfaceEntry = 0x2,
+      /// Mark the entry as a texture variable.
+      OffloadGlobalTextureEntry = 0x3,
+    };
+
   private:
     unsigned Kind : 2;
     unsigned Extern : 1;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6084,6 +6084,10 @@
                        options::OPT_fno_openmp_extensions);
   }
 
+  // Forward the new driver to change offloading code generation.
+  if (Args.hasArg(options::OPT_offload_new_driver))
+    CmdArgs.push_back("--offload-new-driver");
+
   SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType);
 
   const XRayArgs &XRay = TC.getXRayArgs();
diff --git a/clang/test/CodeGenCUDA/offloading-entries.cu b/clang/test/CodeGenCUDA/offloading-entries.cu
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenCUDA/offloading-entries.cu
@@ -0,0 +1,33 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu \
+// RUN:   --offload-new-driver -emit-llvm -o - -x cuda  %s | FileCheck \
+// RUN:   --check-prefix=HOST %s
+
+#include "Inputs/cuda.h"
+
+//.
+// HOST: @x = internal global i32 undef, align 4
+// HOST: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
+// HOST: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
+// HOST: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
+// HOST: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
+// HOST: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
+// HOST: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
+//.
+// HOST-LABEL: @_Z18__device_stub__foov(
+// HOST-NEXT:  entry:
+// HOST-NEXT:    [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__foov)
+// HOST-NEXT:    br label [[SETUP_END:%.*]]
+// HOST:       setup.end:
+// HOST-NEXT:    ret void
+//
+__global__ void foo() {}
+// HOST-LABEL: @_Z18__device_stub__barv(
+// HOST-NEXT:  entry:
+// HOST-NEXT:    [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__barv)
+// HOST-NEXT:    br label [[SETUP_END:%.*]]
+// HOST:       setup.end:
+// HOST-NEXT:    ret void
+//
+__global__ void bar() {}
+__device__ int x = 1;