diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -427,10 +427,11 @@
   /// \param Args - The input arguments.
   /// \param Input - The input type and arguments
   /// \param HostAction - The host action used in the offloading toolchain.
+  /// \param[out] DeviceAction - Offload action left out of the host, if any.
   Action *BuildOffloadingActions(Compilation &C,
                                  llvm::opt::DerivedArgList &Args,
-                                 const InputTy &Input,
-                                 Action *HostAction) const;
+                                 const InputTy &Input, Action *HostAction,
+                                 Action *&DeviceAction) const;
 
   /// Check that the file referenced by Value exists. If it doesn't,
   /// issue a diagnostic and return false.
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -3905,6 +3905,9 @@
     // Build the pipeline for this file.
     Action *Current = C.MakeAction<InputAction>(*InputArg, InputType);
 
+    // The leftover offloading action not merged into the host, if any.
+    Action *CurrentOffload = nullptr;
+
     // Use the current host action in any of the offloading actions, if
     // required.
     if (!Args.hasArg(options::OPT_fopenmp_new_driver))
@@ -3958,7 +3961,7 @@
       // Try to build the offloading actions and add the result as a dependency
       // to the host.
       if (Args.hasArg(options::OPT_fopenmp_new_driver))
-        Current = BuildOffloadingActions(C, Args, I, Current);
+        Current = BuildOffloadingActions(C, Args, I, Current, CurrentOffload);
 
       // FIXME: Should we include any prior module file outputs as inputs of
       // later actions in the same command line?
@@ -3987,6 +3990,12 @@
         break;
     }
 
+    // Bundle any leftover device output with the host action.
+    if (Current && CurrentOffload) {
+      ActionList AL{CurrentOffload, Current};
+      Current = C.MakeAction<OffloadBundlingJobAction>(AL);
+    }
+
     // If we ended with something, add to the output list.
     if (Current)
       Actions.push_back(Current);
@@ -4112,8 +4121,8 @@
 
 Action *Driver::BuildOffloadingActions(Compilation &C,
                                        llvm::opt::DerivedArgList &Args,
-                                       const InputTy &Input,
-                                       Action *HostAction) const {
+                                       const InputTy &Input, Action *HostAction,
+                                       Action *&DeviceAction) const {
   if (!isa<CompileJobAction>(HostAction))
     return HostAction;
 
@@ -4124,6 +4133,8 @@
 
   const Action::OffloadKind OffloadKinds[] = {Action::OFK_OpenMP};
 
+  auto PL = types::getCompilationPhases(*this, Args, InputType);
+
   for (Action::OffloadKind Kind : OffloadKinds) {
     SmallVector<const ToolChain *, 2> ToolChains;
     ActionList DeviceActions;
@@ -4141,8 +4152,6 @@
     if (DeviceActions.empty())
       return HostAction;
 
-    auto PL = types::getCompilationPhases(*this, Args, InputType);
-
     for (phases::ID Phase : PL) {
       if (Phase == phases::Link) {
         assert(Phase == PL.back() && "linking must be final compilation step.");
@@ -4173,6 +4182,14 @@
     }
   }
 
+  // We shouldn't embed the device action in the host if we are targeting a
+  // textual output format.
+  if (PL.back() != phases::Assemble && PL.back() != phases::Link) {
+    DeviceAction = C.MakeAction<OffloadAction>(
+        DDeps, DDeps.getActions().back()->getType());
+    return HostAction;
+  }
+
   OffloadAction::HostDependence HDep(
       *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
       /*BoundArch=*/nullptr, DDeps);
diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -346,3 +346,10 @@
 // RUN:   | FileCheck -check-prefix=NEW_DRIVER_EMBEDDING %s
 
 // NEW_DRIVER_EMBEDDING: -fembed-offload-object=[[CUBIN:.*\.cubin]],openmp.nvptx64-nvidia-cuda.sm_70
+
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_70 \
+// RUN:          --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-new-nvptx-test.bc \
+// RUN:          -fopenmp-new-driver -no-canonical-prefixes -S -emit-llvm -nogpulib %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=NEW_DRIVER_BUNDLING %s
+
+// NEW_DRIVER_BUNDLING: clang-offload-bundler" "-type=ll"