Index: include/clang/Driver/Action.h
===================================================================
--- include/clang/Driver/Action.h
+++ include/clang/Driver/Action.h
@@ -71,9 +71,10 @@
     VerifyPCHJobClass,
     OffloadBundlingJobClass,
     OffloadUnbundlingJobClass,
+    PartialLinkerJobClass,
 
     JobClassFirst = PreprocessJobClass,
-    JobClassLast = OffloadUnbundlingJobClass
+    JobClassLast = PartialLinkerJobClass
   };
 
   // The offloading kind determines if this action is binded to a particular
@@ -589,6 +590,18 @@
   }
 };
 
+class PartialLinkerJobAction : public JobAction {
+  void anchor() override;
+
+public:
+  // Partial linking does not change the type of output.
+  PartialLinkerJobAction(ActionList &Inputs);
+
+  static bool classof(const Action *A) {
+    return A->getKind() == PartialLinkerJobClass;
+  }
+};
+
 } // namespace driver
 } // namespace clang
 
Index: include/clang/Driver/Compilation.h
===================================================================
--- include/clang/Driver/Compilation.h
+++ include/clang/Driver/Compilation.h
@@ -122,6 +122,9 @@
   /// Whether an error during the parsing of the input args.
   bool ContainsError;
 
+  /// Whether the clang-offload-bundler can be skipped.
+  bool SkipOffloadBundler = false;
+
 public:
   Compilation(const Driver &D, const ToolChain &DefaultToolChain,
               llvm::opt::InputArgList *Args,
@@ -301,6 +304,16 @@
   /// of three. The inferior process's stdin(0), stdout(1), and stderr(2) will
   /// be redirected to the corresponding paths, if provided (not llvm::None).
   void Redirect(ArrayRef<Optional<StringRef>> Redirects);
+
+  /// Set whether the compilation can avoid calling the clang-offload-bundler
+  /// for object file types.
+  ///
+  /// \param skipBundler - bool value set once by the driver.
+  void setSkipOffloadBundler(bool skipBundler);
+
+  /// Returns true when calls to the clang-offload-bundler are not required
+  /// for object types.
+  bool canSkipOffloadBundler() const;
 };
 
 } // namespace driver
Index: include/clang/Driver/Driver.h
===================================================================
--- include/clang/Driver/Driver.h
+++ include/clang/Driver/Driver.h
@@ -256,7 +256,7 @@
   llvm::opt::DerivedArgList *
   TranslateInputArgs(const llvm::opt::InputArgList &Args) const;
 
-  // getFinalPhase - Determine which compilation mode we are in and record 
+  // getFinalPhase - Determine which compilation mode we are in and record
   // which option we used to determine the final phase.
   phases::ID getFinalPhase(const llvm::opt::DerivedArgList &DAL,
                            llvm::opt::Arg **FinalPhaseArg = nullptr) const;
@@ -363,12 +363,12 @@
   llvm::opt::InputArgList ParseArgStrings(ArrayRef<const char *> Args,
                                           bool &ContainsError);
 
-  /// BuildInputs - Construct the list of inputs and their types from 
+  /// BuildInputs - Construct the list of inputs and their types from
   /// the given arguments.
   ///
   /// \param TC - The default host tool chain.
   /// \param Args - The input arguments.
-  /// \param Inputs - The list to store the resulting compilation 
+  /// \param Inputs - The list to store the resulting compilation
   /// inputs onto.
   void BuildInputs(const ToolChain &TC, llvm::opt::DerivedArgList &Args,
                    InputList &Inputs) const;
@@ -491,7 +491,7 @@
   /// \param JA - The action of interest.
   /// \param BaseInput - The original input file that this action was
   /// triggered by.
-  /// \param BoundArch - The bound architecture. 
+  /// \param BoundArch - The bound architecture.
   /// \param AtTopLevel - Whether this is a "top-level" action.
   /// \param MultipleArchs - Whether multiple -arch options were supplied.
   /// \param NormalizedTriple - The normalized triple of the relevant target.
@@ -500,7 +500,7 @@
                                  bool AtTopLevel, bool MultipleArchs,
                                  StringRef NormalizedTriple) const;
 
-  /// GetTemporaryPath - Return the pathname of a temporary file to use 
+  /// GetTemporaryPath - Return the pathname of a temporary file to use
   /// as part of compilation; the file will have the given prefix and suffix.
   ///
   /// GCC goes to extra lengths here to be a bit more robust.
Index: include/clang/Driver/Options.td
===================================================================
--- include/clang/Driver/Options.td
+++ include/clang/Driver/Options.td
@@ -1486,6 +1486,10 @@
   Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
 def fno_openmp_cuda_mode : Flag<["-"], "fno-openmp-cuda-mode">, Group<f_Group>,
   Flags<[NoArgumentUnused, HelpHidden]>;
+def fopenmp_use_target_bundling : Flag<["-"], "fopenmp-use-target-bundling">, Group<f_Group>,
+  Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
+def fno_openmp_use_target_bundling : Flag<["-"], "fno-openmp-use-target-bundling">, Group<f_Group>,
+  Flags<[NoArgumentUnused, HelpHidden]>;
 def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group<f_Group>;
 def foptimize_sibling_calls : Flag<["-"], "foptimize-sibling-calls">, Group<f_Group>;
 def fno_escaping_block_tail_calls : Flag<["-"], "fno-escaping-block-tail-calls">, Group<f_Group>, Flags<[CC1Option]>;
Index: include/clang/Driver/ToolChain.h
===================================================================
--- include/clang/Driver/ToolChain.h
+++ include/clang/Driver/ToolChain.h
@@ -126,12 +126,14 @@
   mutable std::unique_ptr<Tool> Assemble;
   mutable std::unique_ptr<Tool> Link;
   mutable std::unique_ptr<Tool> OffloadBundler;
+  mutable std::unique_ptr<Tool> PartialLinker;
 
   Tool *getClang() const;
   Tool *getAssemble() const;
   Tool *getLink() const;
   Tool *getClangAs() const;
   Tool *getOffloadBundler() const;
+  Tool *getPartialLinker() const;
 
   mutable std::unique_ptr<SanitizerArgs> SanitizerArguments;
   mutable std::unique_ptr<XRayArgs> XRayArguments;
Index: lib/Driver/Action.cpp
===================================================================
--- lib/Driver/Action.cpp
+++ lib/Driver/Action.cpp
@@ -40,6 +40,8 @@
     return "clang-offload-bundler";
   case OffloadUnbundlingJobClass:
     return "clang-offload-unbundler";
+  case PartialLinkerJobClass:
+    return "partial-linker";
   }
 
   llvm_unreachable("invalid class");
@@ -388,3 +390,8 @@
 
 OffloadUnbundlingJobAction::OffloadUnbundlingJobAction(Action *Input)
     : JobAction(OffloadUnbundlingJobClass, Input, Input->getType()) {}
+
+void PartialLinkerJobAction::anchor() {}
+
+PartialLinkerJobAction::PartialLinkerJobAction(ActionList &Inputs)
+    : JobAction(PartialLinkerJobClass, Inputs, Inputs.front()->getType()) {}
Index: lib/Driver/Compilation.cpp
===================================================================
--- lib/Driver/Compilation.cpp
+++ lib/Driver/Compilation.cpp
@@ -271,3 +271,11 @@
 void Compilation::Redirect(ArrayRef<Optional<StringRef>> Redirects) {
   this->Redirects = Redirects;
 }
+
+void Compilation::setSkipOffloadBundler(bool skipBundler) {
+  SkipOffloadBundler = skipBundler;
+}
+
+bool Compilation::canSkipOffloadBundler() const {
+  return SkipOffloadBundler;
+}
Index: lib/Driver/Driver.cpp
===================================================================
--- lib/Driver/Driver.cpp
+++ lib/Driver/Driver.cpp
@@ -2683,7 +2683,8 @@
   /// results will be kept in this action builder. Return true if an error was
   /// found.
   bool addHostDependenceToDeviceActions(Action *&HostAction,
-                                        const Arg *InputArg) {
+                                        const Arg *InputArg,
+                                        bool SkipBundler) {
     if (!IsValid)
       return true;
 
@@ -2695,7 +2696,8 @@
     // the input is not a bundle.
     if (CanUseBundler && isa<InputAction>(HostAction) &&
         InputArg->getOption().getKind() == llvm::opt::Option::InputClass &&
-        !types::isSrcFile(HostAction->getType())) {
+        !types::isSrcFile(HostAction->getType()) &&
+        !SkipBundler) {
       auto UnbundlingHostAction =
           C.MakeAction<OffloadUnbundlingJobAction>(HostAction);
       UnbundlingHostAction->registerDependentActionInfo(
@@ -2732,7 +2734,7 @@
   /// function can replace the host action by a bundling action if the
   /// programming models allow it.
   bool appendTopLevelActions(ActionList &AL, Action *HostAction,
-                             const Arg *InputArg) {
+                             const Arg *InputArg, bool usePartialLinkStep) {
     // Get the device actions to be appended.
     ActionList OffloadAL;
     for (auto *SB : SpecializedBuilders) {
@@ -2750,7 +2752,10 @@
       // We expect that the host action was just appended to the action list
       // before this method was called.
       assert(HostAction == AL.back() && "Host action not in the list??");
-      HostAction = C.MakeAction<OffloadBundlingJobAction>(OffloadAL);
+      if (usePartialLinkStep)
+        HostAction = C.MakeAction<PartialLinkerJobAction>(OffloadAL);
+      else
+        HostAction = C.MakeAction<OffloadBundlingJobAction>(OffloadAL);
       AL.back() = HostAction;
     } else
       AL.append(OffloadAL.begin(), OffloadAL.end());
@@ -2913,6 +2918,52 @@
     YcArg = YuArg = nullptr;
   }
 
+  // Determine whether the bundler tool can be skipped based on the set
+  // of triples provided to the -fopenmp-targets flag, if it is present.
+  bool CanSkipClangOffloadBundler = false;
+  if (!Args.hasArg(options::OPT_fopenmp_use_target_bundling)) {
+    if (Arg *OpenMPTargets = C.getInputArgs().getLastArg(
+            options::OPT_fopenmp_targets_EQ)) {
+      if (OpenMPTargets->getValues().size() > 0) {
+        unsigned triplesRequiringBundler = 0;
+        for (const char *Val : OpenMPTargets->getValues()) {
+          llvm::Triple TT(Val);
+
+          // If the list of tripled contains an invalid triple or
+          // contains a valid non-NVPTX triple then the bundler
+          // can be used.
+          if (TT.getArch() == llvm::Triple::UnknownArch ||
+              (TT.getArch() != llvm::Triple::UnknownArch &&
+               !TT.isNVPTX())) {
+            triplesRequiringBundler++;
+          }
+        }
+        CanSkipClangOffloadBundler = (triplesRequiringBundler == 0);
+        C.setSkipOffloadBundler(CanSkipClangOffloadBundler);
+      }
+    }
+  }
+
+  // Determine whether a linker which supports partial linking
+  // exists. On linux systems ld provides this functionality, there
+  // may be other linkers that work also.
+  // TODO: test if linker supports partial linking i.e. -r
+  // We know ld does so we will actually check if the linker
+  // is ld instead but this needs to be replaced.
+  bool CanDoPartialLinking = false;
+  if (CanSkipClangOffloadBundler &&
+      C.getInputArgs().hasArg(options::OPT_c)) {
+    // The bundler can be replaced with a partilal linking step
+    // only when outputing an object. For all other cases the
+    // fallback solution is the clang-offload-bundler.
+    StringRef LinkerName = C.getDefaultToolChain().GetLinkerPath();
+
+    // TODO: test if linker supports partial linking i.e. -r
+    // We know ld does so we will actually check if the linker
+    // is ld instead but this needs to be replaced.
+    CanDoPartialLinking = LinkerName.endswith("/ld");
+  }
+
   // Builder to be used to build offloading actions.
   OffloadingActionBuilder OffloadBuilder(C, Args, Inputs);
 
@@ -2988,7 +3039,13 @@
 
     // Use the current host action in any of the offloading actions, if
     // required.
-    if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg))
+    // The action may contain a bundling step which should not be executed
+    // if the toolchain we are targeting can produce object files that
+    // are understood by the host linker.
+    bool SkipBundler = (InputType == types::TY_Object) &&
+        CanSkipClangOffloadBundler;
+    if (OffloadBuilder.addHostDependenceToDeviceActions(
+            Current, InputArg, SkipBundler))
       break;
 
     for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end();
@@ -3024,7 +3081,8 @@
 
       // Use the current host action in any of the offloading actions, if
       // required.
-      if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg))
+      if (OffloadBuilder.addHostDependenceToDeviceActions(
+              Current, InputArg, SkipBundler))
         break;
 
       if (Current->getType() == types::TY_Nothing)
@@ -3036,7 +3094,8 @@
       Actions.push_back(Current);
 
     // Add any top level actions generated for offloading.
-    OffloadBuilder.appendTopLevelActions(Actions, Current, InputArg);
+    OffloadBuilder.appendTopLevelActions(Actions, Current, InputArg,
+        CanDoPartialLinking);
   }
 
   // Add a link action if necessary.
@@ -3586,6 +3645,7 @@
 
   InputInfoList OffloadDependencesInputInfo;
   bool BuildingForOffloadDevice = TargetDeviceOffloadKind != Action::OFK_None;
+
   if (const OffloadAction *OA = dyn_cast<OffloadAction>(A)) {
     // The 'Darwin' toolchain is initialized only when its arguments are
     // computed. Get the default arguments for OFK_None to ensure that
Index: lib/Driver/ToolChain.cpp
===================================================================
--- lib/Driver/ToolChain.cpp
+++ lib/Driver/ToolChain.cpp
@@ -272,6 +272,12 @@
   return OffloadBundler.get();
 }
 
+Tool *ToolChain::getPartialLinker() const {
+  if (!PartialLinker)
+    PartialLinker.reset(new tools::PartialLinker(*this));
+  return PartialLinker.get();
+}
+
 Tool *ToolChain::getTool(Action::ActionClass AC) const {
   switch (AC) {
   case Action::AssembleJobClass:
@@ -300,6 +306,10 @@
   case Action::OffloadBundlingJobClass:
   case Action::OffloadUnbundlingJobClass:
     return getOffloadBundler();
+
+  case Action::PartialLinkerJobClass:
+    return getPartialLinker();
+
   }
 
   llvm_unreachable("Invalid tool kind.");
@@ -553,7 +563,7 @@
     StringRef Suffix =
       tools::arm::getLLVMArchSuffixForARM(CPU, MArch, Triple);
     bool IsMProfile = ARM::parseArchProfile(Suffix) == ARM::ProfileKind::M;
-    bool ThumbDefault = IsMProfile || (ARM::parseArchVersion(Suffix) == 7 && 
+    bool ThumbDefault = IsMProfile || (ARM::parseArchVersion(Suffix) == 7 &&
                                        getTriple().isOSBinFormatMachO());
     // FIXME: this is invalid for WindowsCE
     if (getTriple().isOSWindows())
Index: lib/Driver/ToolChains/Clang.h
===================================================================
--- lib/Driver/ToolChains/Clang.h
+++ lib/Driver/ToolChains/Clang.h
@@ -147,6 +147,19 @@
                                    const llvm::opt::ArgList &TCArgs,
                                    const char *LinkingOutput) const override;
 };
+
+/// Partial linker tool.
+class LLVM_LIBRARY_VISIBILITY PartialLinker final : public Tool {
+public:
+  PartialLinker(const ToolChain &TC)
+      : Tool("PartialLinker", "partial-linker", TC) {}
+
+  bool hasIntegratedCPP() const override { return false; }
+  void ConstructJob(Compilation &C, const JobAction &JA,
+                    const InputInfo &Output, const InputInfoList &Inputs,
+                    const llvm::opt::ArgList &TCArgs,
+                    const char *LinkingOutput) const override;
+};
 } // end namespace tools
 
 } // end namespace driver
Index: lib/Driver/ToolChains/Clang.cpp
===================================================================
--- lib/Driver/ToolChains/Clang.cpp
+++ lib/Driver/ToolChains/Clang.cpp
@@ -5478,6 +5478,49 @@
   C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
 }
 
+// Begin partial linking
+
+void PartialLinker::ConstructJob(Compilation &C, const JobAction &JA,
+                                 const InputInfo &Output,
+                                 const InputInfoList &Inputs,
+                                 const llvm::opt::ArgList &TCArgs,
+                                 const char *LinkingOutput) const {
+  // The version with only one output is expected to refer to a bundling job.
+  assert(isa<PartialLinkerJobAction>(JA) && "Expecting partial linking job!");
+
+  // The partial linking command line (using ld as example):
+  // ld -r input1.o input2.o -o single-file.o
+  ArgStringList CmdArgs;
+
+  // Ensure conditions are met for doing partial linking instead of bundling.
+  assert(TCArgs.hasArg(options::OPT_c) &&
+      "Can only use partial linking for object file generation.");
+  assert(C.canSkipOffloadBundler() &&
+      "Offload bundler cannot be skipped.");
+
+  // TODO: the assert may be removed once a more elaborate checking is in
+  // place in the Driver.
+  StringRef LinkerName = getToolChain().GetLinkerPath();
+  assert(LinkerName.endswith("/ld") && "Partial linking not supported.");
+
+  // Enable partial linking.
+  CmdArgs.push_back(TCArgs.MakeArgString("-r"));
+
+  // Add input files.
+  for (unsigned I = 0; I < Inputs.size(); ++I) {
+    CmdArgs.push_back(TCArgs.MakeArgString(Inputs[I].getFilename()));
+  }
+
+  // Add output file.
+  CmdArgs.push_back(TCArgs.MakeArgString("-o"));
+  CmdArgs.push_back(TCArgs.MakeArgString(Output.getFilename()));
+
+  // Add partial linker command.
+  C.addCommand(llvm::make_unique<Command>(
+      JA, *this, TCArgs.MakeArgString(getToolChain().GetLinkerPath()),
+      CmdArgs, None));
+}
+
 // Begin OffloadBundler
 
 void OffloadBundler::ConstructJob(Compilation &C, const JobAction &JA,
Index: lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -382,7 +382,8 @@
   CmdArgs.push_back("--gpu-name");
   CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
   CmdArgs.push_back("--output-file");
-  CmdArgs.push_back(Args.MakeArgString(TC.getInputFilename(Output)));
+  const char *CubinF = Args.MakeArgString(TC.getInputFilename(Output));
+  CmdArgs.push_back(CubinF);
   for (const auto& II : Inputs)
     CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
 
@@ -408,6 +409,130 @@
   else
     Exec = Args.MakeArgString(TC.GetProgramPath("ptxas"));
   C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+
+  // For OpenMP targets offloaded to an NVIDIA device offloading, call the
+  // NVIDIA tools that make the object file discoverable by NVLINK.
+  // Wrap the resulting fatbinary file into a host-friendly object file to
+  // be linked with the host object file.
+  if (JA.isDeviceOffloading(Action::OFK_OpenMP) &&
+      Args.hasArg(options::OPT_c) &&
+      C.canSkipOffloadBundler()) {
+    ArgStringList FatbinaryCmdArgs;
+    FatbinaryCmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32");
+
+    ArgStringList CompilerCmdArgs;
+    CompilerCmdArgs.push_back(Args.MakeArgString("-c"));
+    CompilerCmdArgs.push_back(Args.MakeArgString("-o"));
+    CompilerCmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+    CompilerCmdArgs.push_back(Args.MakeArgString(llvm::Twine("-I") +
+        TC.CudaInstallation.getBinPath() + llvm::Twine("/../include")));
+
+    // Create fatbin file using fatbinary executable.
+    SmallString<128> OrigOutputFileName =
+        llvm::sys::path::filename(Output.getFilename());
+
+    // Create fatbin file.
+    const char *FatbinF;
+    if (C.getDriver().isSaveTempsEnabled()) {
+      llvm::sys::path::replace_extension(OrigOutputFileName, "fatbin");
+      FatbinF = C.getArgs().MakeArgString(OrigOutputFileName.c_str());
+    } else {
+      llvm::sys::path::replace_extension(OrigOutputFileName, "");
+      OrigOutputFileName =
+          C.getDriver().GetTemporaryPath(OrigOutputFileName, "fatbin");
+      FatbinF =
+          C.addTempFile(C.getArgs().MakeArgString(OrigOutputFileName.c_str()));
+    }
+    FatbinaryCmdArgs.push_back(
+        Args.MakeArgString(llvm::Twine("--create=") + FatbinF));
+
+    // Create fatbin file wrapper using fatbinary executable.
+    const char *WrappedFatbinF;
+    llvm::sys::path::replace_extension(OrigOutputFileName, "fatbin.c");
+    if (C.getDriver().isSaveTempsEnabled())
+      WrappedFatbinF = C.getArgs().MakeArgString(OrigOutputFileName);
+    else
+      WrappedFatbinF =
+          C.addTempFile(C.getArgs().MakeArgString(OrigOutputFileName));
+    FatbinaryCmdArgs.push_back(
+        Args.MakeArgString(llvm::Twine("--embedded-fatbin=") +
+                           WrappedFatbinF));
+
+    // Continue assembling the host compiler arguments.
+    CompilerCmdArgs.push_back(Args.MakeArgString(WrappedFatbinF));
+
+    StringRef GPUArch = Args.getLastArgValue(options::OPT_march_EQ);
+    assert(!GPUArch.empty() && "At least one GPU Arch required for nvlink.");
+
+    for (const auto& II : Inputs) {
+      SmallString<128> OrigInputFileName =
+          llvm::sys::path::filename(II.getFilename());
+
+      if (II.getType() == types::TY_LLVM_IR ||
+          II.getType() == types::TY_LTO_IR ||
+          II.getType() == types::TY_LTO_BC ||
+          II.getType() == types::TY_LLVM_BC) {
+        C.getDriver().Diag(diag::err_drv_no_linker_llvm_support)
+            << getToolChain().getTripleString();
+        continue;
+      }
+
+      // Currently, we only pass the input files to the linker, we do not pass
+      // any libraries that may be valid only for the host. Any static
+      // libraries will be handled at the link stage.
+      if (!II.isFilename() || OrigInputFileName.endswith(".a"))
+        continue;
+
+      auto *A = II.getAction();
+      assert(A->getInputs().size() == 1 &&
+             "Device offload action is expected to have a single input");
+      CudaArch gpu_arch = StringToCudaArch(GPUArch);
+
+      // We need to pass an Arch of the form "sm_XX" for cubin files and
+      // "compute_XX" for ptx.
+      const char *Arch =
+          (II.getType() == types::TY_PP_Asm)
+              ? CudaVirtualArchToString(VirtualArchForCudaArch(gpu_arch))
+              : GPUArch.str().c_str();
+      const char *PtxF =
+          C.addTempFile(C.getArgs().MakeArgString(II.getFilename()));
+      FatbinaryCmdArgs.push_back("--cmdline=--compile-only");
+      FatbinaryCmdArgs.push_back(
+          Args.MakeArgString(llvm::Twine("--image=profile=") +
+                             Arch + ",file=" + PtxF));
+      FatbinaryCmdArgs.push_back(
+          Args.MakeArgString(llvm::Twine("--image=profile=") +
+              GPUArch.str().c_str() + "@" + Arch + ",file=" + CubinF));
+    }
+
+    FatbinaryCmdArgs.push_back(Args.MakeArgString("--cuda"));
+    FatbinaryCmdArgs.push_back(Args.MakeArgString("--device-c"));
+
+    for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary))
+      FatbinaryCmdArgs.push_back(Args.MakeArgString(A));
+
+    // fatbinary --create=ompprint.fatbin -64
+    //   --image=profile=compute_35,file=ompprint.compute_35.ptx
+    //   --image=profile=sm_35@compute_35,file=ompprint.compute_35.sm_35.cubin
+    //   --embedded-fatbin=ompprint.fatbin.c --cuda --device-c
+    const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary"));
+    C.addCommand(llvm::make_unique<Command>(
+        JA, *this, Exec, FatbinaryCmdArgs, Inputs));
+
+    // Come up with a unique name for the fatbin segment. The name uses
+    // the hash of the full path of the file.
+    std::hash<std::string> hash_fn;
+    size_t hash = hash_fn(llvm::sys::path::filename(Output.getFilename()));
+    CompilerCmdArgs.push_back(
+        Args.MakeArgString(llvm::Twine("-D__NV_MODULE_ID=") +
+            llvm::Twine(hash)));
+
+    // clang++ -c ompprint.fatbin.c -I/path/to/cuda/include/dir
+    const char *CompilerExec =
+        Args.MakeArgString(TC.GetProgramPath("clang++"));
+    C.addCommand(llvm::make_unique<Command>(
+        JA, *this, CompilerExec, CompilerCmdArgs, Inputs));
+  }
 }
 
 static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) {
@@ -512,6 +637,9 @@
   // Add paths specified in LIBRARY_PATH environment variable as -L options.
   addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH");
 
+  if (C.canSkipOffloadBundler())
+    Args.AddAllArgs(CmdArgs, options::OPT_L);
+
   // Add paths for the default clang library path.
   SmallString<256> DefaultLibPath =
       llvm::sys::path::parent_path(TC.getDriver().Dir);
@@ -531,15 +659,37 @@
       continue;
     }
 
-    // Currently, we only pass the input files to the linker, we do not pass
-    // any libraries that may be valid only for the host.
-    if (!II.isFilename())
+    if (!II.isFilename()) {
+      // Anything that's not a file name is potentially a static library
+      // so treat it as such.
+      if (C.canSkipOffloadBundler())
+        CmdArgs.push_back(C.getArgs().MakeArgString(llvm::Twine("-l") +
+              II.getInputArg().getValue()));
       continue;
+    }
 
-    const char *CubinF = C.addTempFile(
-        C.getArgs().MakeArgString(getToolChain().getInputFilename(II)));
-
-    CmdArgs.push_back(CubinF);
+    StringRef OrigInputFileName =
+        llvm::sys::path::filename(II.getBaseInput());
+    if (OrigInputFileName.endswith(".a")) {
+      const char *StaticLibName =
+          C.getArgs().MakeArgString(II.getFilename());
+      CmdArgs.push_back(StaticLibName);
+    } else {
+      // If the original input is not an object file then it means the
+      // assembly step has actually produced a cubin so we need to
+      // rename it accordingly.
+      if ((!C.canSkipOffloadBundler() && OrigInputFileName.endswith(".o")) ||
+          (C.canSkipOffloadBundler() && !OrigInputFileName.endswith(".o"))) {
+        // Create cubin file name and add it as a temporary file.
+        SmallString<256> Filename(II.getFilename());
+        llvm::sys::path::replace_extension(Filename, "cubin");
+        const char *CubinF = C.addTempFile(
+            C.getArgs().MakeArgString(Filename.str()));
+        CmdArgs.push_back(CubinF);
+      } else {
+        CmdArgs.push_back(II.getFilename());
+      }
+    }
   }
 
   AddOpenMPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA);
Index: test/Driver/openmp-offload-gpu-linux.c
===================================================================
--- /dev/null
+++ test/Driver/openmp-offload-gpu-linux.c
@@ -0,0 +1,52 @@
+///
+/// Perform driver tests for OpenMP offloading on Linux systems
+///
+
+// UNSUPPORTED: system-windows
+
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: powerpc-registered-target
+// REQUIRES: nvptx-registered-target
+
+/// Check cubin file generation and partial linking with ld
+// RUN:   %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:          -no-canonical-prefixes -save-temps %s -c 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-PTXAS-CUBIN-BUNDLING %s
+
+// CHK-PTXAS-CUBIN-BUNDLING: clang{{.*}}" "-o" "[[PTX:.*\.s]]"
+// CHK-PTXAS-CUBIN-BUNDLING-NEXT: ptxas{{.*}}" "--output-file" "[[CUBIN:.*\.cubin]]" {{.*}}"[[PTX]]"
+// CHK-PTXAS-CUBIN-BUNDLING: fatbinary{{.*}}" "--create=[[FATBIN:.*\.fatbin]]" "
+// CHK-PTXAS-CUBIN-BUNDLING-SAME: --embedded-fatbin=[[FATBINC:.*\.fatbin.c]]" "
+// CHK-PTXAS-CUBIN-BUNDLING-SAME: --cmdline=--compile-only" "--image=profile={{.*}}[[PTX]]" "
+// CHK-PTXAS-CUBIN-BUNDLING-SAME: --image=profile={{.*}}file=[[CUBIN]]" "--cuda" "--device-c"
+// CHK-PTXAS-CUBIN-BUNDLING: clang++{{.*}}" "-c" "-o" "[[HOSTDEV:.*\.o]]"{{.*}}" "[[FATBINC]]" "-D__NV_MODULE_ID=
+// CHK-PTXAS-CUBIN-BUNDLING-NOT: clang-offload-bundler{{.*}}" "-type=o" {{.*}}"-inputs={{.*}}[[CUBIN]]
+// CHK-PTXAS-CUBIN-BUNDLING: ld" "-r" "[[HOSTDEV]]" "{{.*}}.o" "-o" "{{.*}}.o"
+
+/// ###########################################################################
+
+/// Check object file unbundling is not happening when skipping bundler
+// RUN:   touch %t.o
+// RUN:   %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:          -no-canonical-prefixes -save-temps %t.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-CUBIN-UNBUNDLING-NVLINK %s
+
+/// Use DAG to ensure that object file has not been unbundled.
+// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: nvlink{{.*}}" {{.*}}"[[OBJ:.*\.o]]"
+// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: ld{{.*}}" {{.*}}"[[OBJ]]"
+
+/// ###########################################################################
+
+/// Check object file generation is not happening when skipping bundler
+// RUN:   touch %t1.o
+// RUN:   touch %t2.o
+// RUN:   %clang -### -no-canonical-prefixes -target powerpc64le-unknown-linux-gnu -fopenmp=libomp \
+// RUN:          -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-TWOCUBIN %s
+/// Check cubin file generation and usage by nvlink when toolchain has BindArchAction
+// RUN:   %clang -### -no-canonical-prefixes -target x86_64-apple-darwin17.0.0 -fopenmp=libomp \
+// RUN:          -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-TWOCUBIN %s
+
+// CHK-TWOCUBIN: nvlink{{.*}}openmp-offload-{{.*}}.o" "{{.*}}openmp-offload-{{.*}}.o"
Index: test/Driver/openmp-offload-gpu.c
===================================================================
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -61,7 +61,7 @@
 
 /// Check cubin file generation and bundling
 // RUN:   %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
-// RUN:          -no-canonical-prefixes -save-temps %s -c 2>&1 \
+// RUN:          -no-canonical-prefixes -save-temps %s -c -fopenmp-use-target-bundling 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-PTXAS-CUBIN-BUNDLING %s
 
 // CHK-PTXAS-CUBIN-BUNDLING: clang{{.*}}" "-o" "[[PTX:.*\.s]]"
@@ -73,7 +73,7 @@
 /// Check cubin file unbundling and usage by nvlink
 // RUN:   touch %t.o
 // RUN:   %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
-// RUN:          -no-canonical-prefixes -save-temps %t.o 2>&1 \
+// RUN:          -no-canonical-prefixes -save-temps %t.o -fopenmp-use-target-bundling 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-CUBIN-UNBUNDLING-NVLINK %s
 
 /// Use DAG to ensure that cubin file has been unbundled.
@@ -87,11 +87,11 @@
 // RUN:   touch %t1.o
 // RUN:   touch %t2.o
 // RUN:   %clang -### -no-canonical-prefixes -target powerpc64le-unknown-linux-gnu -fopenmp=libomp \
-// RUN:          -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o 2>&1 \
+// RUN:          -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o -fopenmp-use-target-bundling 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-TWOCUBIN %s
 /// Check cubin file generation and usage by nvlink when toolchain has BindArchAction
 // RUN:   %clang -### -no-canonical-prefixes -target x86_64-apple-darwin17.0.0 -fopenmp=libomp \
-// RUN:          -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o 2>&1 \
+// RUN:          -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o -fopenmp-use-target-bundling 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-TWOCUBIN %s
 
 // CHK-TWOCUBIN: nvlink{{.*}}openmp-offload-{{.*}}.cubin" "{{.*}}openmp-offload-{{.*}}.cubin"
Index: test/Driver/openmp-offload.c
===================================================================
--- test/Driver/openmp-offload.c
+++ test/Driver/openmp-offload.c
@@ -480,13 +480,13 @@
 // Create host object and bundle.
 // CHK-BUJOBS: clang{{.*}}" "-cc1" "-triple" "powerpc64le--linux" "-emit-obj" {{.*}}"-fopenmp" {{.*}}"-o" "
 // CHK-BUJOBS-SAME: [[HOSTOBJ:[^\\/]+\.o]]" "-x" "ir" "{{.*}}[[HOSTBC]]"
-// CHK-BUJOBS: clang-offload-bundler{{.*}}" "-type=o" "-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs=
+// CHK-BUJOBS: clang-offload-bundler{{.*}}" "-type=o"{{.*}}"-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs=
 // CHK-BUJOBS-SAME: [[RES:[^\\/]+\.o]]" "-inputs={{.*}}[[T1OBJ]],{{.*}}[[T2OBJ]],{{.*}}[[HOSTOBJ]]"
 // CHK-BUJOBS-ST: clang{{.*}}" "-cc1" "-triple" "powerpc64le--linux" "-S" {{.*}}"-fopenmp" {{.*}}"-o" "
 // CHK-BUJOBS-ST-SAME: [[HOSTASM:[^\\/]+\.s]]" "-x" "ir" "{{.*}}[[HOSTBC]]"
 // CHK-BUJOBS-ST: clang{{.*}}" "-cc1as" "-triple" "powerpc64le--linux" "-filetype" "obj" {{.*}}"-o" "
 // CHK-BUJOBS-ST-SAME: [[HOSTOBJ:[^\\/]+\.o]]" "{{.*}}[[HOSTASM]]"
-// CHK-BUJOBS-ST: clang-offload-bundler{{.*}}" "-type=o" "-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs=
+// CHK-BUJOBS-ST: clang-offload-bundler{{.*}}" "-type=o"{{.*}}"-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs=
 // CHK-BUJOBS-ST-SAME: [[RES:[^\\/]+\.o]]" "-inputs={{.*}}[[T1OBJ]],{{.*}}[[T2OBJ]],{{.*}}[[HOSTOBJ]]"
 
 /// ###########################################################################