diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -166,11 +166,11 @@
 
 /// Information for a device offloading file extracted from the host.
 struct DeviceFile {
-  DeviceFile(StringRef Kind, StringRef TheTriple, StringRef Arch,
+  DeviceFile(OffloadKind Kind, StringRef TheTriple, StringRef Arch,
              StringRef Filename)
       : Kind(Kind), TheTriple(TheTriple), Arch(Arch), Filename(Filename) {}
 
-  std::string Kind;
+  OffloadKind Kind;
   std::string TheTriple;
   std::string Arch;
   std::string Filename;
@@ -181,15 +181,30 @@
 /// assume device files with matching architectures and triples but different
 /// offloading kinds should be handled together, this may not be true in the
 /// future.
+
+// Provide DenseMapInfo for OffloadKind.
+template <> struct DenseMapInfo<OffloadKind> {
+  static inline OffloadKind getEmptyKey() {
+    return static_cast<OffloadKind>(0xFFFF);
+  }
+  static inline OffloadKind getTombstoneKey() {
+    return static_cast<OffloadKind>(0xFFFF - 1);
+  }
+  static unsigned getHashValue(const OffloadKind &Val) { return Val * 37U; }
+
+  static bool isEqual(const OffloadKind &LHS, const OffloadKind &RHS) {
+    return LHS == RHS;
+  }
+};
 template <> struct DenseMapInfo<DeviceFile> {
   static DeviceFile getEmptyKey() {
-    return {DenseMapInfo<StringRef>::getEmptyKey(),
+    return {static_cast<OffloadKind>(DenseMapInfo<unsigned>::getEmptyKey()),
             DenseMapInfo<StringRef>::getEmptyKey(),
             DenseMapInfo<StringRef>::getEmptyKey(),
             DenseMapInfo<StringRef>::getEmptyKey()};
   }
   static DeviceFile getTombstoneKey() {
-    return {DenseMapInfo<StringRef>::getTombstoneKey(),
+    return {static_cast<OffloadKind>(DenseMapInfo<unsigned>::getTombstoneKey()),
             DenseMapInfo<StringRef>::getTombstoneKey(),
             DenseMapInfo<StringRef>::getTombstoneKey(),
             DenseMapInfo<StringRef>::getTombstoneKey()};
@@ -230,7 +245,7 @@
   auto DeviceAndPath = StringRef(LibraryStr).split('=');
   auto StringAndArch = DeviceAndPath.first.rsplit('-');
   auto KindAndTriple = StringAndArch.first.split('-');
-  return DeviceFile(KindAndTriple.first, KindAndTriple.second,
+  return DeviceFile(getOffloadKind(KindAndTriple.first), KindAndTriple.second,
                     StringAndArch.second, DeviceAndPath.second);
 }
 
@@ -360,8 +375,8 @@
   if (Error E = Output->commit())
     return E;
 
-  DeviceFiles.emplace_back(Kind, Binary.getTriple(), Binary.getArch(),
-                           TempFile);
+  DeviceFiles.emplace_back(Binary.getOffloadKind(), Binary.getTriple(),
+                           Binary.getArch(), TempFile);
 
   Offset += Binary.getSize();
 }
@@ -680,6 +695,39 @@
   return static_cast<std::string>(TempFile);
 }
+
+Expected<std::string> fatbinary(ArrayRef<StringRef> InputFiles,
+                                Triple TheTriple, ArrayRef<StringRef> Archs) {
+  // NVPTX uses the fatbinary program to bundle the linked images.
+  Expected<std::string> FatBinaryPath =
+      findProgram("fatbinary", {CudaBinaryPath});
+  if (!FatBinaryPath)
+    return FatBinaryPath.takeError();
+
+  // Create a new file to write the linked device image to.
+  SmallString<128> TempFile;
+  if (Error Err = createOutputFile(sys::path::filename(ExecutableName) +
+                                       "-device-" + TheTriple.getArchName(),
+                                   "fatbin", TempFile))
+    return std::move(Err);
+
+  BumpPtrAllocator Alloc;
+  StringSaver Saver(Alloc);
+
+  SmallVector<StringRef, 16> CmdArgs;
+  CmdArgs.push_back(*FatBinaryPath);
+  CmdArgs.push_back(TheTriple.isArch64Bit() ? "-64" : "-32");
+  CmdArgs.push_back("--create");
+  CmdArgs.push_back(TempFile);
+  for (const auto &FileAndArch : llvm::zip(InputFiles, Archs))
+    CmdArgs.push_back(Saver.save("--image=profile=" + std::get<1>(FileAndArch) +
+                                 ",file=" + std::get<0>(FileAndArch)));
+
+  if (Error Err = executeCommands(*FatBinaryPath, CmdArgs))
+    return std::move(Err);
+
+  return static_cast<std::string>(TempFile);
+}
 } // namespace nvptx
 
 namespace amdgcn {
 Expected<std::string> link(ArrayRef<std::string> InputFiles, Triple TheTriple,
@@ -1124,43 +1172,53 @@
 /// Runs the appropriate linking action on all the device files specified in \p
 /// DeviceFiles. The linked device images are returned in \p LinkedImages.
 Error linkDeviceFiles(ArrayRef<DeviceFile> DeviceFiles,
-                      SmallVectorImpl<std::string> &LinkedImages) {
-  // Get the list of inputs for a specific device.
+                      SmallVectorImpl<DeviceFile> &LinkedImages) {
+  // Get the list of inputs and active offload kinds for a specific device.
   DenseMap<DeviceFile, SmallVector<std::string, 4>> LinkerInputMap;
-  for (auto &File : DeviceFiles)
+  DenseMap<DeviceFile, DenseSet<OffloadKind>> ActiveOffloadKinds;
+  for (auto &File : DeviceFiles) {
+    ActiveOffloadKinds[File].insert(File.Kind);
     LinkerInputMap[File].push_back(File.Filename);
+  }
 
   // Try to link each device toolchain.
   for (auto &LinkerInput : LinkerInputMap) {
     DeviceFile &File = LinkerInput.getFirst();
     Triple TheTriple = Triple(File.TheTriple);
+    auto &LinkerInputFiles = LinkerInput.getSecond();
     bool WholeProgram = false;
 
     // Run LTO on any bitcode files and replace the input with the result.
-    if (Error Err = linkBitcodeFiles(LinkerInput.getSecond(), TheTriple,
-                                     File.Arch, WholeProgram))
+    if (Error Err = linkBitcodeFiles(LinkerInputFiles, TheTriple, File.Arch,
+                                     WholeProgram))
       return Err;
 
     // If we are embedding bitcode for JIT, skip the final device linking.
     if (EmbedBitcode) {
-      assert(!LinkerInput.getSecond().empty() && "No bitcode image to embed");
-      LinkedImages.push_back(LinkerInput.getSecond().front());
+      assert(!LinkerInputFiles.empty() && "No bitcode image to embed");
+      LinkedImages.emplace_back(OFK_OpenMP, TheTriple.getTriple(), File.Arch,
+                                LinkerInputFiles.front());
       continue;
     }
 
     // If we performed LTO on NVPTX and had whole program visibility, we can use
     // CUDA in non-RDC mode.
    if (WholeProgram && TheTriple.isNVPTX()) {
-      assert(!LinkerInput.getSecond().empty() && "No non-RDC image to embed");
-      LinkedImages.push_back(LinkerInput.getSecond().front());
+      assert(!LinkerInputFiles.empty() && "No non-RDC image to embed");
+      for (OffloadKind Kind : ActiveOffloadKinds[LinkerInput.getFirst()])
+        LinkedImages.emplace_back(Kind, TheTriple.getTriple(), File.Arch,
+                                  LinkerInputFiles.front());
       continue;
     }
 
-    auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch);
+    auto ImageOrErr = linkDevice(LinkerInputFiles, TheTriple, File.Arch);
     if (!ImageOrErr)
       return ImageOrErr.takeError();
 
-    LinkedImages.push_back(*ImageOrErr);
+    // Create separate images for all the active offload kinds.
+    for (OffloadKind Kind : ActiveOffloadKinds[LinkerInput.getFirst()])
+      LinkedImages.emplace_back(Kind, TheTriple.getTriple(), File.Arch,
+                                *ImageOrErr);
   }
 
   return Error::success();
 }
@@ -1205,32 +1263,94 @@
   return static_cast<std::string>(ObjectFile);
 }
 
-/// Creates the object file containing the device image and runtime registration
-/// code from the device images stored in \p Images.
-Expected<std::string> wrapDeviceImages(ArrayRef<std::string> Images) {
+/// Load all of the OpenMP images into a buffer and pass it to the binary
+/// wrapping function to create the registration code in the module \p M.
+Error wrapOpenMPImages(Module &M, ArrayRef<DeviceFile> Images) {
   SmallVector<std::unique_ptr<MemoryBuffer>, 4> SavedBuffers;
   SmallVector<ArrayRef<char>, 4> ImagesToWrap;
-
-  for (StringRef ImageFilename : Images) {
+  for (const DeviceFile &File : Images) {
     llvm::ErrorOr<std::unique_ptr<MemoryBuffer>> ImageOrError =
-        llvm::MemoryBuffer::getFileOrSTDIN(ImageFilename);
+        llvm::MemoryBuffer::getFileOrSTDIN(File.Filename);
     if (std::error_code EC = ImageOrError.getError())
-      return createFileError(ImageFilename, EC);
+      return createFileError(File.Filename, EC);
     ImagesToWrap.emplace_back((*ImageOrError)->getBufferStart(),
                               (*ImageOrError)->getBufferSize());
     SavedBuffers.emplace_back(std::move(*ImageOrError));
   }
 
-  LLVMContext Context;
-  Module M("offload.wrapper.module", Context);
-  M.setTargetTriple(HostTriple);
-  if (Error Err = wrapBinaries(M, ImagesToWrap))
-    return std::move(Err);
+  if (Error Err = wrapOpenMPBinaries(M, ImagesToWrap))
+    return Err;
+  return Error::success();
+}
+
+/// Combine all of the CUDA images into a single fatbinary and pass it to the
+/// binary wrapping function to create the registration code in the module \p M.
+Error wrapCudaImages(Module &M, ArrayRef<DeviceFile> Images) {
+  SmallVector<StringRef, 4> InputFiles;
+  SmallVector<StringRef, 4> Architectures;
+  for (const DeviceFile &File : Images) {
+    InputFiles.push_back(File.Filename);
+    Architectures.push_back(File.Arch);
+  }
+
+  // CUDA expects its embedded device images to be a fatbinary.
+  Triple TheTriple = Triple(Images.front().TheTriple);
+  auto FileOrErr = nvptx::fatbinary(InputFiles, TheTriple, Architectures);
+  if (!FileOrErr)
+    return FileOrErr.takeError();
+
+  llvm::ErrorOr<std::unique_ptr<MemoryBuffer>> ImageOrError =
+      llvm::MemoryBuffer::getFileOrSTDIN(*FileOrErr);
+  if (std::error_code EC = ImageOrError.getError())
+    return createFileError(*FileOrErr, EC);
+
+  auto ImageToWrap = ArrayRef<char>((*ImageOrError)->getBufferStart(),
+                                    (*ImageOrError)->getBufferSize());
 
-  if (PrintWrappedModule)
-    llvm::errs() << M;
+  if (Error Err = wrapCudaBinary(M, ImageToWrap))
+    return Err;
+  return Error::success();
+}
+
+/// Creates the object file containing the device image and runtime
+/// registration code from the device images stored in \p Images.
+Expected<SmallVector<std::string>>
+wrapDeviceImages(ArrayRef<DeviceFile> Images) {
+  DenseMap<OffloadKind, SmallVector<DeviceFile, 4>> ImagesForKind;
+  for (const DeviceFile &Image : Images)
+    ImagesForKind[Image.Kind].push_back(Image);
+
+  SmallVector<std::string> WrappedImages;
+  for (const auto &KindAndImages : ImagesForKind) {
+    LLVMContext Context;
+    Module M("offload.wrapper.module", Context);
+    M.setTargetTriple(HostTriple);
+
+    switch (KindAndImages.getFirst()) {
+    case OFK_OpenMP:
+      if (Error Err = wrapOpenMPImages(M, KindAndImages.getSecond()))
+        return std::move(Err);
+      break;
+    case OFK_Cuda:
+      if (Error Err = wrapCudaImages(M, KindAndImages.getSecond()))
+        return std::move(Err);
+      break;
+    default:
+      return createStringError(inconvertibleErrorCode(),
+                               getOffloadKindName(KindAndImages.getFirst()) +
+                                   " wrapping is not supported");
+    }
+
+    if (PrintWrappedModule)
+      llvm::errs() << M;
+
+    auto FileOrErr = compileModule(M);
+    if (!FileOrErr)
+      return FileOrErr.takeError();
+    WrappedImages.push_back(*FileOrErr);
+  }
 
-  return compileModule(M);
+  return WrappedImages;
 }
 
 Optional<std::string> findFile(StringRef Dir, const Twine &Name) {
@@ -1361,7 +1481,7 @@
     DeviceFiles.push_back(getBitcodeLibrary(LibraryStr));
 
   // Link the device images extracted from the linker input.
-  SmallVector<std::string, 16> LinkedImages;
+  SmallVector<DeviceFile, 16> LinkedImages;
   if (Error Err = linkDeviceFiles(DeviceFiles, LinkedImages))
     return reportError(std::move(Err));
 
@@ -1370,7 +1490,7 @@
   auto FileOrErr = wrapDeviceImages(LinkedImages);
   if (!FileOrErr)
     return reportError(FileOrErr.takeError());
-  LinkerArgs.push_back(*FileOrErr);
+  LinkerArgs.append(*FileOrErr);
 
   // Run the host linking job.
   if (Error Err = runLinker(LinkerUserPath, LinkerArgs))
diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.h b/clang/tools/clang-linker-wrapper/OffloadWrapper.h
--- a/clang/tools/clang-linker-wrapper/OffloadWrapper.h
+++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.h
@@ -1,4 +1,4 @@
-//===- OffloadWrapper.h ----------------------------------------------*- C++ -*-===//
+//===- OffloadWrapper.h -----------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -14,7 +14,11 @@
 
 /// Wrap the input device images into the module \p M as global symbols and
 /// registers the images with the OpenMP Offloading runtime libomptarget.
-llvm::Error wrapBinaries(llvm::Module &M,
-                         llvm::ArrayRef<llvm::ArrayRef<char>> Images);
+llvm::Error wrapOpenMPBinaries(llvm::Module &M,
+                               llvm::ArrayRef<llvm::ArrayRef<char>> Images);
+
+/// Wrap the input fatbinary image into the module \p M as a global symbol and
+/// register the image with the CUDA runtime.
+llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef<char> Images);
 
 #endif
diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
--- a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
@@ -257,7 +257,7 @@
 
 } // namespace
 
-Error wrapBinaries(Module &M, ArrayRef<ArrayRef<char>> Images) {
+Error wrapOpenMPBinaries(Module &M, ArrayRef<ArrayRef<char>> Images) {
   GlobalVariable *Desc = createBinDesc(M, Images);
   if (!Desc)
     return createStringError(inconvertibleErrorCode(),
@@ -266,3 +266,8 @@
   createUnregisterFunction(M, Desc);
   return Error::success();
 }
+
+llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef<char> Images) {
+  return createStringError(inconvertibleErrorCode(),
+                           "Cuda wrapping is not yet supported.");
+}
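
Note (illustrative, not part of the patch): the core idea above is to make OffloadKind usable as a hashed key so images can be grouped per offload kind (the DenseSet<OffloadKind> in linkDeviceFiles, the per-kind map in wrapDeviceImages). The standalone sketch below shows that same DenseMapInfo pattern with a stand-in OffloadKind enum; it assumes only LLVM's ADT headers and Support library are available, and none of the names in it come from the patch itself.

// Sketch only: this OffloadKind enum is a local stand-in, not the enum the
// linker wrapper actually uses.
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdint>
#include <cstdio>

enum OffloadKind : uint16_t { OFK_None, OFK_OpenMP, OFK_Cuda };

namespace llvm {
// Same shape as the specialization added in the patch: reserve two values
// that no real OffloadKind takes as the empty and tombstone keys.
template <> struct DenseMapInfo<OffloadKind> {
  static inline OffloadKind getEmptyKey() {
    return static_cast<OffloadKind>(0xFFFF);
  }
  static inline OffloadKind getTombstoneKey() {
    return static_cast<OffloadKind>(0xFFFF - 1);
  }
  static unsigned getHashValue(const OffloadKind &Val) { return Val * 37U; }
  static bool isEqual(const OffloadKind &LHS, const OffloadKind &RHS) {
    return LHS == RHS;
  }
};
} // namespace llvm

int main() {
  // Group image filenames by offload kind, mirroring how wrapDeviceImages
  // emits one wrapper module per active kind.
  llvm::DenseMap<OffloadKind, llvm::SmallVector<const char *, 4>> ImagesForKind;
  ImagesForKind[OFK_OpenMP].push_back("openmp-device.img");
  ImagesForKind[OFK_Cuda].push_back("cuda-device.fatbin");

  for (const auto &KindAndImages : ImagesForKind)
    std::printf("kind %u: %zu image(s)\n",
                static_cast<unsigned>(KindAndImages.getFirst()),
                KindAndImages.getSecond().size());
  return 0;
}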