Index: include/clang/Basic/DiagnosticDriverKinds.td
===================================================================
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -69,6 +69,8 @@
   "invalid Xarch argument: '%0', options requiring arguments are unsupported">;
 def err_drv_invalid_Xarch_argument_isdriver : Error<
   "invalid Xarch argument: '%0', cannot change driver behavior inside Xarch argument">;
+def err_drv_Xopenmp_target_missing_triple : Error<
+  "cannot deduce implicit triple value for -Xopenmp-target, specify triple using -Xopenmp-target=<triple>">;
 def err_drv_argument_only_allowed_with : Error<
   "invalid argument '%0' only allowed with '%1'">;
 def err_drv_argument_not_allowed_with : Error<
Index: include/clang/Driver/Options.td
===================================================================
--- include/clang/Driver/Options.td
+++ include/clang/Driver/Options.td
@@ -459,6 +459,10 @@
   HelpText<"Pass <arg> to fatbinary invocation">, MetaVarName<"<arg>">;
 def Xcuda_ptxas : Separate<["-"], "Xcuda-ptxas">,
   HelpText<"Pass <arg> to the ptxas assembler">, MetaVarName<"<arg>">;
+def Xopenmp_target : Separate<["-"], "Xopenmp-target">,
+  HelpText<"Pass <arg> to the target offloading toolchain.">, MetaVarName<"<arg>">;
+def Xopenmp_target_EQ : JoinedAndSeparate<["-"], "Xopenmp-target=">,
+  HelpText<"Pass <arg> to the specified target offloading toolchain. The triple that identifies the toolchain must be provided after the equals sign.">, MetaVarName<"<arg>">;
 def z : Separate<["-"], "z">, Flags<[LinkerInput, RenderAsInput]>,
   HelpText<"Pass -z <arg> to the linker">, MetaVarName<"<arg>">,
   Group<Link_Group>;
Index: lib/Driver/Driver.cpp
===================================================================
--- lib/Driver/Driver.cpp
+++ lib/Driver/Driver.cpp
@@ -525,7 +525,7 @@
     auto &CudaTC = ToolChains[CudaTriple.str() + "/" + HostTriple.str()];
     if (!CudaTC) {
       CudaTC = llvm::make_unique<toolchains::CudaToolChain>(
-          *this, CudaTriple, *HostTC, C.getInputArgs());
+          *this, CudaTriple, *HostTC, C.getInputArgs(), Action::OFK_Cuda);
     }
     C.addOffloadDeviceToolChain(CudaTC.get(), Action::OFK_Cuda);
   }
@@ -583,7 +583,7 @@
                   ToolChains[TT.str() + "/" + HostTC->getTriple().normalize()];
               if (!CudaTC)
                 CudaTC = llvm::make_unique<toolchains::CudaToolChain>(
-                    *this, TT, *HostTC, C.getInputArgs());
+                    *this, TT, *HostTC, C.getInputArgs(), Action::OFK_OpenMP);
               TC = CudaTC.get();
             } else
               TC = &getToolChain(C.getInputArgs(), TT);
Index: lib/Driver/ToolChains/Clang.cpp
===================================================================
--- lib/Driver/ToolChains/Clang.cpp
+++ lib/Driver/ToolChains/Clang.cpp
@@ -5253,7 +5253,13 @@
   for (unsigned I = 0; I < Outputs.size(); ++I) {
     if (I)
       UB += ',';
-    UB += Outputs[I].getFilename();
+    SmallString<256> OutputFileName(Outputs[I].getFilename());
+    // Change extension of target files for OpenMP offloading
+    // to NVIDIA GPUs.
+    if (DepInfo[I].DependentToolChain->getTriple().isNVPTX() &&
+        JA.isOffloading(Action::OFK_OpenMP))
+      llvm::sys::path::replace_extension(OutputFileName, "cubin");
+    UB += OutputFileName;
   }
   CmdArgs.push_back(TCArgs.MakeArgString(UB));
   CmdArgs.push_back("-unbundle");
Index: lib/Driver/ToolChains/CommonArgs.h
===================================================================
--- lib/Driver/ToolChains/CommonArgs.h
+++ lib/Driver/ToolChains/CommonArgs.h
@@ -39,6 +39,13 @@
                     llvm::opt::ArgStringList &CmdArgs,
                     const llvm::opt::ArgList &Args);
 
+void AddOpenMPLinkerScript(const ToolChain &TC, Compilation &C,
+                           const InputInfo &Output,
+                           const InputInfoList &Inputs,
+                           const llvm::opt::ArgList &Args,
+                           llvm::opt::ArgStringList &CmdArgs,
+                           const JobAction &JA);
+
 const char *SplitDebugName(const llvm::opt::ArgList &Args,
                            const InputInfo &Input);
 
Index: lib/Driver/ToolChains/CommonArgs.cpp
===================================================================
--- lib/Driver/ToolChains/CommonArgs.cpp
+++ lib/Driver/ToolChains/CommonArgs.cpp
@@ -1023,3 +1023,128 @@
     break;
   }
 }
+
+/// Add OpenMP linker script arguments at the end of the argument list so that
+/// the fat binary is built by embedding each of the device images into the
+/// host. The linker script also defines a few symbols required by the code
+/// generation so that the images can be easily retrieved at runtime by the
+/// offloading library. This should be used only in tool chains that support
+/// linker scripts.
+void tools::AddOpenMPLinkerScript(const ToolChain &TC, Compilation &C,
+                                  const InputInfo &Output,
+                                  const InputInfoList &Inputs,
+                                  const ArgList &Args, ArgStringList &CmdArgs,
+                                  const JobAction &JA) {
+
+  // If this is not an OpenMP host toolchain, we don't need to do anything.
+  if (!JA.isHostOffloading(Action::OFK_OpenMP))
+    return;
+
+  // Create temporary linker script. Keep it if save-temps is enabled.
+  const char *LKS;
+  SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
+  if (C.getDriver().isSaveTempsEnabled()) {
+    llvm::sys::path::replace_extension(Name, "lk");
+    LKS = C.getArgs().MakeArgString(Name.c_str());
+  } else {
+    llvm::sys::path::replace_extension(Name, "");
+    Name = C.getDriver().GetTemporaryPath(Name, "lk");
+    LKS = C.addTempFile(C.getArgs().MakeArgString(Name.c_str()));
+  }
+
+  // Add linker script option to the command.
+  CmdArgs.push_back("-T");
+  CmdArgs.push_back(LKS);
+
+  // Create a buffer to write the contents of the linker script.
+  std::string LksBuffer;
+  llvm::raw_string_ostream LksStream(LksBuffer);
+
+  // Get the OpenMP offload tool chains so that we can extract the triple
+  // associated with each device input.
+  auto OpenMPToolChains = C.getOffloadToolChains<Action::OFK_OpenMP>();
+  assert(OpenMPToolChains.first != OpenMPToolChains.second &&
+         "No OpenMP toolchains??");
+
+  // Track the input file name and device triple in order to build the script,
+  // inserting binaries in the designated sections.
+  SmallVector<std::pair<std::string, const char *>, 8> InputBinaryInfo;
+
+  // Add commands to embed target binaries. We ensure that each section and
+  // image is 16-byte aligned. This is not mandatory, but increases the
+  // likelihood of data to be aligned with a cache block in several main host
+  // machines.
+  LksStream << "/*\n";
+  LksStream << "       OpenMP Offload Linker Script\n";
+  LksStream << " *** Automatically generated by Clang ***\n";
+  LksStream << "*/\n";
+  LksStream << "TARGET(binary)\n";
+  auto DTC = OpenMPToolChains.first;
+  for (auto &II : Inputs) {
+    const Action *A = II.getAction();
+    // Is this a device linking action?
+    if (A && isa<LinkJobAction>(A) &&
+        A->isDeviceOffloading(Action::OFK_OpenMP)) {
+      assert(DTC != OpenMPToolChains.second &&
+             "More device inputs than device toolchains??");
+      InputBinaryInfo.push_back(std::make_pair(
+          DTC->second->getTriple().normalize(), II.getFilename()));
+      ++DTC;
+      LksStream << "INPUT(" << II.getFilename() << ")\n";
+    }
+  }
+
+  assert(DTC == OpenMPToolChains.second &&
+         "Less device inputs than device toolchains??");
+
+  LksStream << "SECTIONS\n";
+  LksStream << "{\n";
+
+  // Put each target binary into a separate section.
+  for (const auto &BI : InputBinaryInfo) {
+    LksStream << "  .omp_offloading." << BI.first << " :\n";
+    LksStream << "  ALIGN(0x10)\n";
+    LksStream << "  {\n";
+    LksStream << "    PROVIDE_HIDDEN(.omp_offloading.img_start." << BI.first
+              << " = .);\n";
+    LksStream << "    " << BI.second << "\n";
+    LksStream << "    PROVIDE_HIDDEN(.omp_offloading.img_end." << BI.first
+              << " = .);\n";
+    LksStream << "  }\n";
+  }
+
+  // Add commands to define host entries begin and end. We use 1-byte subalign
+  // so that the linker does not add any padding and the elements in this
+  // section form an array.
+  LksStream << "  .omp_offloading.entries :\n";
+  LksStream << "  ALIGN(0x10)\n";
+  LksStream << "  SUBALIGN(0x01)\n";
+  LksStream << "  {\n";
+  LksStream << "    PROVIDE_HIDDEN(.omp_offloading.entries_begin = .);\n";
+  LksStream << "    *(.omp_offloading.entries)\n";
+  LksStream << "    PROVIDE_HIDDEN(.omp_offloading.entries_end = .);\n";
+  LksStream << "  }\n";
+  LksStream << "}\n";
+  LksStream << "INSERT BEFORE .data\n";
+  LksStream.flush();
+
+  // Dump the contents of the linker script if the user requested that. We
+  // support this option to enable testing of behavior with -###.
+  if (C.getArgs().hasArg(options::OPT_fopenmp_dump_offload_linker_script))
+    llvm::errs() << LksBuffer;
+
+  // If this is a dry run, do not create the linker script file.
+  if (C.getArgs().hasArg(options::OPT__HASH_HASH_HASH))
+    return;
+
+  // Open script file and write the contents.
+  std::error_code EC;
+  llvm::raw_fd_ostream Lksf(LKS, EC, llvm::sys::fs::F_None);
+
+  if (EC) {
+    C.getDriver().Diag(clang::diag::err_unable_to_make_temp) << EC.message();
+    return;
+  }
+
+  Lksf << LksBuffer;
+}
Index: lib/Driver/ToolChains/Cuda.h
===================================================================
--- lib/Driver/ToolChains/Cuda.h
+++ lib/Driver/ToolChains/Cuda.h
@@ -112,6 +112,20 @@
                      const char *LinkingOutput) const override;
 };
 
+class LLVM_LIBRARY_VISIBILITY OpenMPLinker : public Tool {
+ public:
+   OpenMPLinker(const ToolChain &TC)
+       : Tool("NVPTX::OpenMPLinker", "fatbinary", TC, RF_Full, llvm::sys::WEM_UTF8,
+              "--options-file") {}
+
+   bool hasIntegratedCPP() const override { return false; }
+
+   void ConstructJob(Compilation &C, const JobAction &JA,
+                     const InputInfo &Output, const InputInfoList &Inputs,
+                     const llvm::opt::ArgList &TCArgs,
+                     const char *LinkingOutput) const override;
+};
+
 } // end namespace NVPTX
 } // end namespace tools
 
@@ -120,7 +134,8 @@
 class LLVM_LIBRARY_VISIBILITY CudaToolChain : public ToolChain {
 public:
   CudaToolChain(const Driver &D, const llvm::Triple &Triple,
-                const ToolChain &HostTC, const llvm::opt::ArgList &Args);
+                const ToolChain &HostTC, const llvm::opt::ArgList &Args,
+                const Action::OffloadKind OK);
 
   virtual const llvm::Triple *getAuxTriple() const override {
     return &HostTC.getTriple();
@@ -169,6 +184,9 @@
 protected:
   Tool *buildAssembler() const override;  // ptxas
   Tool *buildLinker() const override;     // fatbinary (ok, not really a linker)
+
+private:
+  const Action::OffloadKind OK;
 };
 
 } // end namespace toolchains
Index: lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -9,7 +9,9 @@
 
 #include "Cuda.h"
 #include "InputInfo.h"
+#include "CommonArgs.h"
 #include "clang/Basic/Cuda.h"
+#include "clang/Config/config.h"
 #include "clang/Basic/VirtualFileSystem.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
@@ -212,8 +214,20 @@
       static_cast<const toolchains::CudaToolChain &>(getToolChain());
   assert(TC.getTriple().isNVPTX() && "Wrong platform");
 
+  StringRef GPUArchName;
+  std::vector<std::string> GPUArchNames;
+  // If this is an OpenMP action we need to extract the device architecture from
+  // the -fopenmp-target-arch option.
+  if (JA.isDeviceOffloading(Action::OFK_OpenMP)) {
+    GPUArchNames = Args.getAllArgValues(options::OPT_march_EQ);
+    assert(GPUArchNames.size() == 1 &&
+           "Exactly one GPU Arch required for ptxas.");
+    GPUArchName = GPUArchNames[0];
+  } else
+    GPUArchName = JA.getOffloadingArch();
+
   // Obtain architecture from the action.
-  CudaArch gpu_arch = StringToCudaArch(JA.getOffloadingArch());
+  CudaArch gpu_arch = StringToCudaArch(GPUArchName);
   assert(gpu_arch != CudaArch::UNKNOWN &&
          "Device action expected to have an architecture.");
 
@@ -265,7 +279,10 @@
   CmdArgs.push_back("--gpu-name");
   CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
   CmdArgs.push_back("--output-file");
-  CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+  SmallString<256> OutputFileName(Output.getFilename());
+  if (JA.isOffloading(Action::OFK_OpenMP))
+    llvm::sys::path::replace_extension(OutputFileName, "cubin");
+  CmdArgs.push_back(Args.MakeArgString(OutputFileName));
   for (const auto& II : Inputs)
     CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
 
@@ -324,14 +341,93 @@
   C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
 }
 
+void NVPTX::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA,
+                                       const InputInfo &Output,
+                                       const InputInfoList &Inputs,
+                                       const ArgList &Args,
+                                       const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+  assert(TC.getTriple().isNVPTX() && "Wrong platform");
+
+  ArgStringList CmdArgs;
+
+  // OpenMP uses nvlink to link cubin files. The result will be embedded in the
+  // host binary by the host linker.
+  assert(!JA.isHostOffloading(Action::OFK_OpenMP) &&
+         "CUDA toolchain not expected for an OpenMP host device.");
+
+  if (Output.isFilename()) {
+    CmdArgs.push_back("-o");
+    CmdArgs.push_back(Output.getFilename());
+  } else
+    assert(Output.isNothing() && "Invalid output.");
+  if (Args.hasArg(options::OPT_g_Flag))
+    CmdArgs.push_back("-g");
+
+  if (Args.hasArg(options::OPT_v))
+    CmdArgs.push_back("-v");
+
+  std::vector<std::string> GPUArchs =
+      Args.getAllArgValues(options::OPT_march_EQ);
+  assert(GPUArchs.size() == 1 && "Exactly one GPU Arch required for ptxas.");
+  const std::string &GPUArch = GPUArchs[0];
+
+  CmdArgs.push_back("-arch");
+  CmdArgs.push_back(Args.MakeArgString(GPUArch));
+
+  // Add paths specified in LIBRARY_PATH environment variable as -L options.
+  addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH");
+
+  // Add paths for the default clang library path.
+  SmallString<256> DefaultLibPath =
+      llvm::sys::path::parent_path(TC.getDriver().Dir);
+  llvm::sys::path::append(DefaultLibPath, "lib" CLANG_LIBDIR_SUFFIX);
+  CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath));
+
+  // Add linking against library implementing OpenMP calls on NVPTX target.
+  CmdArgs.push_back("-lomptarget-nvptx");
+
+  for (const auto &II : Inputs) {
+    if (II.getType() == types::TY_LLVM_IR ||
+        II.getType() == types::TY_LTO_IR ||
+        II.getType() == types::TY_LTO_BC ||
+        II.getType() == types::TY_LLVM_BC) {
+      C.getDriver().Diag(diag::err_drv_no_linker_llvm_support)
+          << getToolChain().getTripleString();
+      continue;
+    }
+
+    // Currently, we only pass the input files to the linker, we do not pass
+    // any libraries that may be valid only for the host.
+    if (!II.isFilename())
+      continue;
+
+    SmallString<256> Name = llvm::sys::path::filename(II.getFilename());
+    llvm::sys::path::replace_extension(Name, "cubin");
+
+    const char *CubinF =
+        C.addTempFile(C.getArgs().MakeArgString(Name));
+
+    CmdArgs.push_back(CubinF);
+  }
+
+  AddOpenMPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA);
+
+  const char *Exec =
+      Args.MakeArgString(getToolChain().GetProgramPath("nvlink"));
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}
+
 /// CUDA toolchain.  Our assembler is ptxas, and our "linker" is fatbinary,
 /// which isn't properly a linker but nonetheless performs the step of stitching
 /// together object files from the assembler into a single blob.
 
 CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
-                             const ToolChain &HostTC, const ArgList &Args)
+                             const ToolChain &HostTC, const ArgList &Args,
+                             const Action::OffloadKind OK)
     : ToolChain(D, Triple, Args), HostTC(HostTC),
-      CudaInstallation(D, HostTC.getTriple(), Args) {
+      CudaInstallation(D, HostTC.getTriple(), Args), OK(OK) {
   if (CudaInstallation.isValid())
     getProgramPaths().push_back(CudaInstallation.getBinPath());
 }
@@ -392,6 +488,22 @@
   CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
 }
 
+void AddMArchOption(DerivedArgList *DAL,
+                    const OptTable &Opts,
+                    StringRef Opt) {
+  if (Opt.startswith("-march=")) {
+    StringRef Arch = Opt.split("=").second;
+    // Check if the arch provided is valid for this toolchain.
+    // If not valid, ignore it.
+    if (StringToCudaArch(Arch) != CudaArch::UNKNOWN) {
+      DAL->eraseArg(options::OPT_march_EQ);
+      DAL->AddJoinedArg(nullptr,
+          Opts.getOption(options::OPT_march_EQ),
+          Arch.str());
+    }
+  }
+}
+
 llvm::opt::DerivedArgList *
 CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
                              StringRef BoundArch,
@@ -405,7 +517,7 @@
 
   // For OpenMP device offloading, append derived arguments. Make sure
   // flags are not duplicated.
-  // TODO: Append the compute capability.
+  // Also append the compute capability.
   if (DeviceOffloadKind == Action::OFK_OpenMP) {
     for (Arg *A : Args){
       bool IsDuplicate = false;
@@ -418,6 +530,49 @@
       if (!IsDuplicate)
         DAL->append(A);
     }
+
+    // Get the compute capability from the -Xopenmp-target flag.
+    auto OptList = Args.getAllArgValues(options::OPT_Xopenmp_target_EQ);
+
+    // For each OPT_Xopenmp_target_EQ option, the function returns
+    // two strings, the triple and the option.
+    // The following format is assumed:
+    //
+    // -Xopenmp-target=nvptx64-nvidia-cuda -opt=val
+    for (unsigned i = 0; i < OptList.size(); i+=2) {
+      StringRef Opt = OptList[i+1];
+      if (OptList[i] == getTripleString())
+        AddMArchOption(DAL, Opts, Opt);
+    }
+
+    OptList = Args.getAllArgValues(options::OPT_Xopenmp_target);
+    // When there is only one option in the list, the following format
+    // is assumed:
+    //
+    // -Xopenmp-target -opt=val
+
+    // By default, if no triple is explicitely specified, we
+    // associate -opt=val with the toolchain specified under the
+    // -fopenmp-targets flag (provided that there is only one such
+    // toolchain specified).
+    if (!OptList.empty() &&
+        Args.getAllArgValues(
+            options::OPT_fopenmp_targets_EQ).size() != 1)
+      getDriver().Diag(diag::err_drv_Xopenmp_target_missing_triple);
+
+    // Add arch
+    for (StringRef Opt : OptList) {
+      AddMArchOption(DAL, Opts, Opt);
+    }
+
+    auto MArchList = DAL->getAllArgValues(options::OPT_march_EQ);
+    assert(MArchList.size() < 2 &&
+           "Too many archs under -Xopenmp-targets");
+    if (MArchList.empty())
+      // Default compute capability for CUDA toolchain is sm_20.
+      DAL->AddJoinedArg(nullptr,
+          Opts.getOption(options::OPT_march_EQ), "sm_20");
+
     return DAL;
   }
 
@@ -467,6 +622,8 @@
 }
 
 Tool *CudaToolChain::buildLinker() const {
+  if (OK == Action::OFK_OpenMP)
+    return new tools::NVPTX::OpenMPLinker(*this);
   return new tools::NVPTX::Linker(*this);
 }
 
Index: lib/Driver/ToolChains/Gnu.cpp
===================================================================
--- lib/Driver/ToolChains/Gnu.cpp
+++ lib/Driver/ToolChains/Gnu.cpp
@@ -203,131 +203,6 @@
   // The types are (hopefully) good enough.
 }
 
-/// Add OpenMP linker script arguments at the end of the argument list so that
-/// the fat binary is built by embedding each of the device images into the
-/// host. The linker script also defines a few symbols required by the code
-/// generation so that the images can be easily retrieved at runtime by the
-/// offloading library. This should be used only in tool chains that support
-/// linker scripts.
-static void AddOpenMPLinkerScript(const ToolChain &TC, Compilation &C,
-                                  const InputInfo &Output,
-                                  const InputInfoList &Inputs,
-                                  const ArgList &Args, ArgStringList &CmdArgs,
-                                  const JobAction &JA) {
-
-  // If this is not an OpenMP host toolchain, we don't need to do anything.
-  if (!JA.isHostOffloading(Action::OFK_OpenMP))
-    return;
-
-  // Create temporary linker script. Keep it if save-temps is enabled.
-  const char *LKS;
-  SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
-  if (C.getDriver().isSaveTempsEnabled()) {
-    llvm::sys::path::replace_extension(Name, "lk");
-    LKS = C.getArgs().MakeArgString(Name.c_str());
-  } else {
-    llvm::sys::path::replace_extension(Name, "");
-    Name = C.getDriver().GetTemporaryPath(Name, "lk");
-    LKS = C.addTempFile(C.getArgs().MakeArgString(Name.c_str()));
-  }
-
-  // Add linker script option to the command.
-  CmdArgs.push_back("-T");
-  CmdArgs.push_back(LKS);
-
-  // Create a buffer to write the contents of the linker script.
-  std::string LksBuffer;
-  llvm::raw_string_ostream LksStream(LksBuffer);
-
-  // Get the OpenMP offload tool chains so that we can extract the triple
-  // associated with each device input.
-  auto OpenMPToolChains = C.getOffloadToolChains<Action::OFK_OpenMP>();
-  assert(OpenMPToolChains.first != OpenMPToolChains.second &&
-         "No OpenMP toolchains??");
-
-  // Track the input file name and device triple in order to build the script,
-  // inserting binaries in the designated sections.
-  SmallVector<std::pair<std::string, const char *>, 8> InputBinaryInfo;
-
-  // Add commands to embed target binaries. We ensure that each section and
-  // image is 16-byte aligned. This is not mandatory, but increases the
-  // likelihood of data to be aligned with a cache block in several main host
-  // machines.
-  LksStream << "/*\n";
-  LksStream << "       OpenMP Offload Linker Script\n";
-  LksStream << " *** Automatically generated by Clang ***\n";
-  LksStream << "*/\n";
-  LksStream << "TARGET(binary)\n";
-  auto DTC = OpenMPToolChains.first;
-  for (auto &II : Inputs) {
-    const Action *A = II.getAction();
-    // Is this a device linking action?
-    if (A && isa<LinkJobAction>(A) &&
-        A->isDeviceOffloading(Action::OFK_OpenMP)) {
-      assert(DTC != OpenMPToolChains.second &&
-             "More device inputs than device toolchains??");
-      InputBinaryInfo.push_back(std::make_pair(
-          DTC->second->getTriple().normalize(), II.getFilename()));
-      ++DTC;
-      LksStream << "INPUT(" << II.getFilename() << ")\n";
-    }
-  }
-
-  assert(DTC == OpenMPToolChains.second &&
-         "Less device inputs than device toolchains??");
-
-  LksStream << "SECTIONS\n";
-  LksStream << "{\n";
-
-  // Put each target binary into a separate section.
-  for (const auto &BI : InputBinaryInfo) {
-    LksStream << "  .omp_offloading." << BI.first << " :\n";
-    LksStream << "  ALIGN(0x10)\n";
-    LksStream << "  {\n";
-    LksStream << "    PROVIDE_HIDDEN(.omp_offloading.img_start." << BI.first
-              << " = .);\n";
-    LksStream << "    " << BI.second << "\n";
-    LksStream << "    PROVIDE_HIDDEN(.omp_offloading.img_end." << BI.first
-              << " = .);\n";
-    LksStream << "  }\n";
-  }
-
-  // Add commands to define host entries begin and end. We use 1-byte subalign
-  // so that the linker does not add any padding and the elements in this
-  // section form an array.
-  LksStream << "  .omp_offloading.entries :\n";
-  LksStream << "  ALIGN(0x10)\n";
-  LksStream << "  SUBALIGN(0x01)\n";
-  LksStream << "  {\n";
-  LksStream << "    PROVIDE_HIDDEN(.omp_offloading.entries_begin = .);\n";
-  LksStream << "    *(.omp_offloading.entries)\n";
-  LksStream << "    PROVIDE_HIDDEN(.omp_offloading.entries_end = .);\n";
-  LksStream << "  }\n";
-  LksStream << "}\n";
-  LksStream << "INSERT BEFORE .data\n";
-  LksStream.flush();
-
-  // Dump the contents of the linker script if the user requested that. We
-  // support this option to enable testing of behavior with -###.
-  if (C.getArgs().hasArg(options::OPT_fopenmp_dump_offload_linker_script))
-    llvm::errs() << LksBuffer;
-
-  // If this is a dry run, do not create the linker script file.
-  if (C.getArgs().hasArg(options::OPT__HASH_HASH_HASH))
-    return;
-
-  // Open script file and write the contents.
-  std::error_code EC;
-  llvm::raw_fd_ostream Lksf(LKS, EC, llvm::sys::fs::F_None);
-
-  if (EC) {
-    C.getDriver().Diag(clang::diag::err_unable_to_make_temp) << EC.message();
-    return;
-  }
-
-  Lksf << LksBuffer;
-}
-
 static bool addXRayRuntime(const ToolChain &TC, const ArgList &Args,
                            ArgStringList &CmdArgs) {
   if (Args.hasFlag(options::OPT_fxray_instrument,
Index: test/Driver/openmp-offload.c
===================================================================
--- test/Driver/openmp-offload.c
+++ test/Driver/openmp-offload.c
@@ -597,3 +597,76 @@
 // RUN:   | FileCheck -check-prefix=CHK-FOPENMP-IS-DEVICE %s
 
 // CHK-FOPENMP-IS-DEVICE: clang{{.*}} "-aux-triple" "powerpc64le--linux" {{.*}}.c" "-fopenmp-is-device" "-fopenmp-host-ir-file-path"
+
+/// ###########################################################################
+
+/// Check -Xopenmp-target=powerpc64le-ibm-linux-gnu -march=pwr8 is passed when compiling for the device.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=powerpc64le-ibm-linux-gnu -Xopenmp-target=powerpc64le-ibm-linux-gnu -march=pwr8 %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-FOPENMP-EQ-TARGET %s
+
+// CHK-FOPENMP-EQ-TARGET: clang{{.*}} argument unused during compilation: '-Xopenmp-target=powerpc64le-ibm-linux-gnu -march=pwr8'
+
+/// ###########################################################################
+
+/// Check -Xopenmp-target -march=pwr8 is passed when compiling for the device.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=powerpc64le-ibm-linux-gnu -Xopenmp-target -march=pwr8 %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-FOPENMP-TARGET %s
+
+// CHK-FOPENMP-TARGET: clang{{.*}} argument unused during compilation: '-Xopenmp-target -march=pwr8'
+
+/// ###########################################################################
+
+/// Check -Xopenmp-target triggers error when multiple triples are used.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=powerpc64le-ibm-linux-gnu,nvptx64-nvidia-cuda -Xopenmp-target -march=pwr8 %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-FOPENMP-TARGET-ERROR %s
+
+// CHK-FOPENMP-TARGET-ERROR: clang{{.*}} error: cannot deduce implicit triple value for -Xopenmp-target, specify triple using -Xopenmp-target=<triple>
+
+/// ###########################################################################
+
+/// Check -Xopenmp-target uses one of the archs provided when several archs are used.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_35 -Xopenmp-target -march=sm_60 %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-FOPENMP-TARGET-ARCHS %s
+
+// CHK-FOPENMP-TARGET-ARCHS: ptxas{{.*}}" "--gpu-name" "sm_60"
+// CHK-FOPENMP-TARGET-ARCHS: nvlink{{.*}}" "-arch" "sm_60"
+
+/// ###########################################################################
+
+/// Check -Xopenmp-target -march=pwr8 is ignored in favor of sm_20 (default).
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=powerpc64le-ibm-linux-gnu,nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvidia-cuda -march=pwr8 %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-FOPENMP-TARGET-WARNING %s
+
+// CHK-FOPENMP-TARGET-WARNING: ptxas{{.*}}" "--gpu-name" "sm_20"
+// CHK-FOPENMP-TARGET-WARNING: nvlink{{.*}}" "-arch" "sm_20"
+
+/// ###########################################################################
+
+/// Check -Xopenmp-target -march=sm_35 works as expected when two triples are present.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=powerpc64le-ibm-linux-gnu,nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_35 %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-FOPENMP-TARGET-COMPILATION %s
+
+// CHK-FOPENMP-TARGET-COMPILATION: ptxas{{.*}}" "--gpu-name" "sm_35"
+// CHK-FOPENMP-TARGET-COMPILATION: nvlink{{.*}}" "-arch" "sm_35"
+
+/// ###########################################################################
+
+/// Check cubin file generation and usage by nvlink
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -save-temps -no-canonical-prefixes %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-CUBIN %s
+
+// CHK-CUBIN: clang{{.*}}" "-o" "{{.*}}-openmp-nvptx64-nvidia-cuda.s"
+// CHK-CUBIN-NEXT: ptxas{{.*}}" "--output-file" "{{.*}}-openmp-nvptx64-nvidia-cuda.cubin" "{{.*}}-openmp-nvptx64-nvidia-cuda.s"
+// CHK-CUBIN-NEXT: nvlink" "-o" "{{.*}}-openmp-nvptx64-nvidia-cuda" {{.*}} "openmp-offload-openmp-nvptx64-nvidia-cuda.cubin"
+
+/// ###########################################################################
+
+/// Check cubin file generation and usage by nvlink
+// RUN:   touch %t1.o
+// RUN:   touch %t2.o
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -save-temps -no-canonical-prefixes %t1.o %t2.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-TWOCUBIN %s
+
+// CHK-TWOCUBIN: clang-offload-bundler" "-type=o" "{{.*}}inputs={{.*}}tmp1.o" "-outputs={{.*}}.o,{{.*}}tmp1-openmp-nvptx64-nvidia-cuda.cubin" "-unbundle"
+// CHK-TWOCUBIN-NEXT: clang-offload-bundler" "-type=o" "{{.*}}inputs={{.*}}tmp2.o" "-outputs={{.*}}.o,{{.*}}tmp2-openmp-nvptx64-nvidia-cuda.cubin" "-unbundle"
+// CHK-TWOCUBIN-NEXT: nvlink" "-o" "{{.*}}-openmp-nvptx64-nvidia-cuda" {{.*}} "openmp-offload.c.tmp1-openmp-nvptx64-nvidia-cuda.cubin" "openmp-offload.c.tmp2-openmp-nvptx64-nvidia-cuda.cubin"