diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -40,6 +40,8 @@
 #define GEN_PASS_DECL_GPUTOLLVMCONVERSIONPASS
 #include "mlir/Conversion/Passes.h.inc"
+#define GEN_PASS_DECL_GPUTOOFFLOADCONVERSIONPASS
+#include "mlir/Conversion/Passes.h.inc"
 
 using OwnedBlob = std::unique_ptr<std::vector<char>>;
 using BlobGenerator =
@@ -54,6 +56,18 @@
                                StringRef gpuBinaryAnnotation = {},
                                bool kernelBarePtrCallConv = false);
 
+/// Collect a set of patterns to convert from the GPU dialect to LLVM with
+/// offload annotations and populate the converter for gpu types.
+void populateGpuToLLVMOffloadConversionPatterns(
+    LLVMTypeConverter &converter, RewritePatternSet &patterns,
+    bool kernelBarePtrCallConv = false);
+
+/// Utility function that concatenates all offloading objects in a module into
+/// a single object and inserts it at the start of the module. If the module
+/// contains no gpu.modules, nothing is added. Returns failure if a gpu.module
+/// doesn't contain a valid offload annotation.
+LogicalResult makeLLVMOffloadObject(ModuleOp module,
+                                    LLVMTypeConverter &converter);
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -381,6 +381,39 @@
   ];
 }
 
+def GpuToOffloadConversionPass : Pass<"gpu-to-offload", "ModuleOp"> {
+  let summary = "Convert GPU dialect to LLVM dialect with offload annotations.";
+
+  let description = [{
+    Creates a pass to convert GPU operations into a sequence of GPU runtime
+    calls and to add clang-compatible offload annotations for clang's new
+    offload driver.
+
+    This pass does not generate code to call GPU runtime APIs directly but
+    instead uses a small wrapper library that exports a stable and conveniently
+    typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP).
+    For more information:
+    https://clang.llvm.org/docs/OffloadingDesign.html
+    https://clang.llvm.org/docs/ClangOffloadPackager.html
+  }];
+
+  let options = [
+    Option<"kernelBarePtrCallConv", "use-bare-pointers-for-kernels", "bool",
+           /*default=*/"false",
+           "Use bare pointers to pass memref arguments to kernels. "
+           "The kernel must use the same setting for this option."
+    >,
+    Option<"useOpaquePointers", "use-opaque-pointers", "bool",
+           /*default=*/"true", "Generate LLVM IR using opaque pointers "
+           "instead of typed pointers">,
+  ];
+
+  let dependentDialects = [
+    "::mlir::LLVM::LLVMDialect",
+    "memref::MemRefDialect",
+  ];
+}
+
 def LowerHostCodeToLLVMPass : Pass<"lower-host-to-llvm", "ModuleOp"> {
   let summary = "Lowers the host module code and `gpu.launch_func` to LLVM";
 
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -31,6 +31,14 @@
 class FuncOp;
 } // namespace func
 
+namespace gpu {
+/// Returns the default CUDA toolkit path, empty if it wasn't found.
+StringRef getDefaultCudaToolkitPath();
+
+/// Returns the default ROCm path.
+StringRef getDefaultRocmPath();
+} // namespace gpu
+
 #define GEN_PASS_DECL
 #include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
 
@@ -62,6 +70,33 @@
 }
 namespace gpu {
+/// Define gpu offload kinds.
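+/// A gpu.module processed by the gpu-to-nvptx or gpu-to-amdgpu passes carries
+/// one of these kinds next to its offload object; the annotated module looks
+/// roughly like:
+///   gpu.module @kernels attributes {llvm.offload.kind = "cuda",
+///                                   llvm.offload.object = "<blob>"} { ... }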
+enum class OffloadKind { unk = -1, cuda, hip };
+
+/// Convert a string to offload kind.
+inline OffloadKind getOffloadKind(StringRef kind) {
+  if (kind == "cuda")
+    return OffloadKind::cuda;
+  else if (kind == "hip")
+    return OffloadKind::hip;
+  return OffloadKind::unk;
+}
+
+/// Get the string representation of an offload kind.
+inline StringRef fromOffloadKind(OffloadKind kind) {
+  if (kind == OffloadKind::cuda)
+    return "cuda";
+  else if (kind == OffloadKind::hip)
+    return "hip";
+  return "unk";
+}
+
+/// Returns the annotation name for GPU offload object blobs.
+StringRef getGpuOffloadObjectAnnotation();
+
+/// Returns the annotation name for GPU offload kind.
+StringRef getGpuOffloadKindAnnotation();
+
 /// Base pass class to serialize kernel functions through LLVM into
 /// user-specified IR and add the resulting blob as module attribute.
 class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -37,4 +37,101 @@
   let dependentDialects = ["mlir::gpu::GPUDialect"];
 }
 
+def GpuNameMangling: Pass<"gpu-name-mangling",
+                          "::mlir::ModuleOp"> {
+  let summary = "Mangle the names of all the top symbols inside a gpu.module.";
+  let description = [{
+    Mangle the names of all the top-level definitions inside a `gpu.module`,
+    for all the `gpu.module`s inside a `module`, from `<symbol>` to
+    `__G<gpu module name>_S<symbol>`, and update all the symbol references.
+  }];
+  let dependentDialects = ["mlir::gpu::GPUDialect"];
+}
+
+def GpuToNVPTXOffload: Pass<"gpu-to-nvptx",
+                            "::mlir::gpu::GPUModuleOp"> {
+  let summary = "Lowers a `gpu.module` to a binary annotation with NVVM code.";
+  let description = [{
+    Lowers a `gpu.module` with NVVM LLVM IR to a binary annotation compatible
+    with clang's new offload driver and the `clang-offload-packager` tool.
+    For more information:
+    https://clang.llvm.org/docs/OffloadingDesign.html
+    https://clang.llvm.org/docs/ClangOffloadPackager.html
+  }];
+  let options = [
+    Option<"triple", "triple", "std::string",
+           /*default=*/ "\"nvptx64-nvidia-cuda\"",
+           "Target triple.">,
+    Option<"chip", "chip", "std::string",
+           /*default=*/"\"sm_35\"",
+           "Target chip.">,
+    Option<"features", "features", "std::string",
+           /*default=*/"\"+ptx60\"",
+           "Target features.">,
+    Option<"cudaPath", "cuda-path", "std::string",
+           /*default=*/"gpu::getDefaultCudaToolkitPath().str()",
+           "CUDA Toolkit path.">,
+    Option<"optLevel", "O", "unsigned",
+           /*default=*/"0",
+           "Optimization level.">,
+    ListOption<"bcPaths", "libs", "std::string",
+               "Extra bitcode libraries paths.">,
+  ];
+  let dependentDialects = ["mlir::gpu::GPUDialect"];
+}
+
+def GpuToAMDGPUOffload: Pass<"gpu-to-amdgpu",
+                             "::mlir::gpu::GPUModuleOp"> {
+  let summary = "Lowers a `gpu.module` to a binary annotation with ROCDL code.";
+  let description = [{
+    Lowers a `gpu.module` with ROCDL LLVM IR to a binary annotation compatible
+    with clang's new offload driver and the `clang-offload-packager` tool.
+ For more information: + https://clang.llvm.org/docs/OffloadingDesign.html + https://clang.llvm.org/docs/ClangOffloadPackager.html + }]; + let options = [ + Option<"triple", "triple", "std::string", + /*default=*/ "\"amdgcn-amd-amdhsa\"", + "Target triple.">, + Option<"chip", "chip", "std::string", + /*default=*/"\"generic\"", + "Target chip.">, + Option<"features", "features", "std::string", + /*default=*/"\"\"", + "Target features.">, + Option<"rocmPath", "rocm-path", "std::string", + /*default=*/"gpu::getDefaultRocmPath().str()", + "ROCm path.">, + Option<"optLevel", "O", "unsigned", + /*default=*/"0", + "Optimization level.">, + Option<"wave64", "wave64", "bool", + /*default=*/"true", + "Use Wave64 mode.">, + Option<"daz", "daz", "bool", + /*default=*/"false", + "Enable denormals are zero opt.">, + Option<"finiteOnly", "finite-only", "bool", + /*default=*/"false", + "Enable finite only opt.">, + Option<"unsafeMath", "unsafe-math", "bool", + /*default=*/"false", + "Enable unsafe math opt.">, + Option<"fastMath", "fast-math", "bool", + /*default=*/"false", + "Enable fast relaxed math opt.">, + Option<"correctSqrt", "correct-sqrt", "bool", + /*default=*/"true", + "Enable correct rounded sqrt.">, + Option<"abiVer", "abi-ver", "std::string", + /*default=*/"\"400\"", + "ABI version.">, + ListOption<"bcPaths", "libs", "std::string", + "Extra bitcode libraries paths.">, + ]; + let dependentDialects = ["mlir::gpu::GPUDialect"]; +} + #endif // MLIR_DIALECT_GPU_PASSES + diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -41,6 +41,8 @@ namespace mlir { #define GEN_PASS_DEF_GPUTOLLVMCONVERSIONPASS #include "mlir/Conversion/Passes.h.inc" +#define GEN_PASS_DEF_GPUTOOFFLOADCONVERSIONPASS +#include "mlir/Conversion/Passes.h.inc" } // namespace mlir using namespace mlir; @@ -58,6 +60,62 @@ void runOnOperation() override; }; +class GpuToOffloadConversionPass + : public impl::GpuToOffloadConversionPassBase { +public: + using Base::Base; + + void runOnOperation() final; +}; + +// Class for creating and managing offload entries & objects. +// For more information on these entries: +// https://clang.llvm.org/docs/OffloadingDesign.html#generating-offloading-entries +// https://clang.llvm.org/docs/ClangOffloadPackager.html +// https://clang.llvm.org/docs/ClangLinkerWrapper.html +struct GPUOffloadBuilder { + // Offload string constants. + static constexpr char kOffloadStructTypeName[] = "struct.__tgt_offload_entry"; + static constexpr char kOffloadEntryName[] = ".omp_offloading.entry_name."; + static constexpr char kOffloadEntry[] = ".omp_offloading.entry."; + + static constexpr char kOffloadObjectGlobalId[] = "llvm.embedded.object"; + static constexpr char kCompilerUsedGlobalId[] = "llvm.compiler.used"; + + static constexpr char kLLVMOffloadingSection[] = ".llvm.offloading"; + static constexpr char kLLVMMetadataSection[] = "llvm.metadata"; + static constexpr char kCudaOffloadingSection[] = "cuda_offloading_entries"; + static constexpr char kHipOffloadingSection[] = "hip_offloading_entries"; + + // Get the offloading entry type, described in: + static LLVM::LLVMStructType getOffloadEntryType(MLIRContext *context); + + // Create an unique identifier for the kernel combining the module and kernel + // name: __M_K. 
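+  // For example, for gpu.module `foo` and kernel `bar` this produces
+  // `_Mfoo_Kbar`.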
+  static std::string getUniqueIdentifier(gpu::LaunchFuncOp kernelLaunch);
+
+  // Create an identifier for the kernel stub: `<unique identifier>_stub`.
+  static std::string getStubIdentifier(gpu::LaunchFuncOp kernelLaunch);
+
+  // Create a kernel host stub, needed for function registration by the
+  // runtimes.
+  static std::string createKernelStub(gpu::LaunchFuncOp kernelLaunch,
+                                      OpBuilder &builder);
+
+  // Inserts an offloading entry into the top module.
+  static LogicalResult insertOffloadEntry(gpu::LaunchFuncOp kernelLaunch,
+                                          OpBuilder &builder);
+
+  // Calls all the required methods to obtain a valid offloading launch
+  // address.
+  static Value getOrCreateLaunchAddress(gpu::LaunchFuncOp kernelLaunch,
+                                        OpBuilder &builder);
+
+  // Concatenates all offloading objects into a single object and adds it at
+  // the start of the top module.
+  static LogicalResult createOffloadObject(ModuleOp module,
+                                           LLVMTypeConverter &converter);
+};
+
 struct FunctionCallBuilder {
   FunctionCallBuilder(StringRef functionName, Type returnType,
                       ArrayRef<Type> argumentTypes)
@@ -308,10 +366,12 @@
 public:
   ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter,
                                              StringRef gpuBinaryAnnotation,
-                                             bool kernelBarePtrCallConv)
+                                             bool kernelBarePtrCallConv,
+                                             bool llvmOffload = false)
       : ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
         gpuBinaryAnnotation(gpuBinaryAnnotation),
-        kernelBarePtrCallConv(kernelBarePtrCallConv) {}
+        kernelBarePtrCallConv(kernelBarePtrCallConv), llvmOffload(llvmOffload) {
+  }
 
 private:
   Value generateParamsArray(gpu::LaunchFuncOp launchOp, OpAdaptor adaptor,
@@ -325,6 +385,7 @@
   llvm::SmallString<32> gpuBinaryAnnotation;
 
   bool kernelBarePtrCallConv;
+  bool llvmOffload;
 };
 
 class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
@@ -408,6 +469,315 @@
     signalPassFailure();
 }
 
+void GpuToOffloadConversionPass::runOnOperation() {
+  LowerToLLVMOptions options(&getContext());
+  options.useOpaquePointers = useOpaquePointers;
+
+  LLVMTypeConverter converter(&getContext(), options);
+
+  // Create the offload object.
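+  // That is, concatenate every gpu.module's offload binary into a single
+  // `llvm.embedded.object` global (section `.llvm.offloading`) at the start
+  // of this module.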
+ if (failed(GPUOffloadBuilder::createOffloadObject(getOperation(), converter))) + signalPassFailure(); + + RewritePatternSet patterns(&getContext()); + LLVMConversionTarget target(getContext()); + + target.addIllegalDialect(); + + mlir::arith::populateArithToLLVMConversionPatterns(converter, patterns); + mlir::cf::populateControlFlowToLLVMConversionPatterns(converter, patterns); + populateVectorToLLVMConversionPatterns(converter, patterns); + populateFinalizeMemRefToLLVMConversionPatterns(converter, patterns); + populateFuncToLLVMConversionPatterns(converter, patterns); + populateAsyncStructuralTypeConversionsAndLegality(converter, patterns, + target); + populateGpuToLLVMOffloadConversionPatterns(converter, patterns, + kernelBarePtrCallConv); + + if (failed( + applyPartialConversion(getOperation(), target, std::move(patterns)))) + signalPassFailure(); +} + +LogicalResult makeLLVMOffloadObject(ModuleOp module, + LLVMTypeConverter &converter) { + return GPUOffloadBuilder::createOffloadObject(module, converter); +} + +LLVM::LLVMStructType +GPUOffloadBuilder::getOffloadEntryType(MLIRContext *context) { + auto type = + LLVM::LLVMStructType::getIdentified(context, kOffloadStructTypeName); + if (!type.isInitialized()) { + // Create the offload struct entry according to: + // https://clang.llvm.org/docs/OffloadingDesign.html#generating-offloading-entries + auto ptrType = LLVM::LLVMPointerType::get(context); + auto i32Type = IntegerType::get(context, 32); + auto i64Type = IntegerType::get(context, 64); + auto result = + type.setBody({ptrType, ptrType, i64Type, i32Type, i32Type}, false); + if (!result.succeeded()) + return nullptr; + } + return type; +} + +std::string +GPUOffloadBuilder::getUniqueIdentifier(gpu::LaunchFuncOp kernelLaunch) { + return "_M" + kernelLaunch.getKernelModuleName().str() + "_K" + + kernelLaunch.getKernelName().str(); +} + +std::string +GPUOffloadBuilder::getStubIdentifier(gpu::LaunchFuncOp kernelLaunch) { + return getUniqueIdentifier(kernelLaunch) + "_stub"; +} + +std::string GPUOffloadBuilder::createKernelStub(gpu::LaunchFuncOp kernelLaunch, + OpBuilder &builder) { + // Create the stub name. + auto name = getStubIdentifier(kernelLaunch); + + // Get the top module. + auto module = kernelLaunch->getParentOfType(); + { + // Avoid inserting the stub more than once. + auto op = module.lookupSymbol(name); + if (op) + return name; + } + auto insertionGuard = ConversionPatternRewriter::InsertionGuard(builder); + builder.setInsertionPointToStart(&module.getRegion().front()); + + // Create a simple function stub `void()`, what's important is the address of + // the function for doing the kernel registration. + auto voidType = LLVM::LLVMVoidType::get(builder.getContext()); + auto func = builder.create( + kernelLaunch.getLoc(), name, LLVM::LLVMFunctionType::get(voidType, {}), + LLVM::Linkage::External, true); + auto block = func.addEntryBlock(); + builder.setInsertionPointToStart(block); + builder.create(kernelLaunch.getLoc(), ValueRange()); + return name; +} + +LogicalResult +GPUOffloadBuilder::insertOffloadEntry(gpu::LaunchFuncOp kernelLaunch, + OpBuilder &builder) { + + using namespace LLVM; + auto module = kernelLaunch->getParentOfType(); + if (!module) { + emitError(kernelLaunch.getLoc(), "operation is not inside of a ModuleOp."); + return failure(); + } + // Create the identifiers for the entries. 
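+  // For a kernel with unique id `_Mfoo_Kbar` these are
+  // `.omp_offloading.entry_name._Mfoo_Kbar` and
+  // `.omp_offloading.entry._Mfoo_Kbar`.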
+ auto name = kernelLaunch.getKernelName(); + auto kernelUid = getUniqueIdentifier(kernelLaunch); + std::string entryNameId = kOffloadEntryName + kernelUid; + std::string entryId = kOffloadEntry + kernelUid; + { + // Avoid inserting the entry more than once. + auto entryName = module.lookupSymbol(entryNameId); + auto entry = module.lookupSymbol(entryId); + // Entries are already there. + if (entryName && entry) + return success(); + // One of the entries is missing. + if ((entry && !entryName) || (!entry && entryName)) { + emitError(kernelLaunch.getLoc(), + "one of the offloading entries is missing."); + return failure(); + } + } + + // For information on these entries see: + // https://clang.llvm.org/docs/OffloadingDesign.html#generating-offloading-entries + // Define and create useful variables. + auto context = builder.getContext(); + auto loc = module.getLoc(); + auto ptrType = LLVM::LLVMPointerType::get(context); + auto i32Type = builder.getI32Type(); + auto i64Type = builder.getI64Type(); + + // Set the insertion point to the start of the module. + auto insertionGuard = ConversionPatternRewriter::InsertionGuard(builder); + builder.setInsertionPointToStart(&module.getRegion().front()); + + // Obtain the kernel name including the null terminator. + auto kernelName = + builder.getStringAttr(StringRef(name.data(), name.size() + 1)); + + // Create the kernel name offloading entry. + auto stringCnstType = + LLVMArrayType::get(IntegerType::get(context, 8), kernelName.size()); + GlobalOp entryName = builder.create( + loc, /* Type */ stringCnstType, /* Constant */ true, + /* Name */ entryNameId, + /* Linkage */ Linkage::Internal, /* DSO local */ false, + /* Thread local */ false, /* Value */ kernelName, + /* Alignment */ nullptr, /* Address space */ 0, + /* Unnamed address */ + UnnamedAddrAttr::get(context, UnnamedAddr::Global), + /* Section */ nullptr); + + // Determine the offloading section kind. + auto kernelModule = SymbolTable::lookupNearestSymbolFrom( + kernelLaunch, kernelLaunch.getKernelModuleName()); + if (!module) { + emitError(kernelLaunch.getLoc(), "expected a kernel module"); + return failure(); + } + StringRef offloadSection; + if (auto attr = kernelModule->getAttr(gpu::getGpuOffloadKindAnnotation())) + if (auto strAttr = dyn_cast(attr)) { + gpu::OffloadKind offloadKind = gpu::getOffloadKind(strAttr.getValue()); + if (offloadKind == gpu::OffloadKind::cuda) + offloadSection = kCudaOffloadingSection; + else if (offloadKind == gpu::OffloadKind::hip) + offloadSection = kHipOffloadingSection; + } + if (offloadSection.empty()) { + emitError(kernelModule->getLoc(), + "the module doesn't contain a valid offloading kind."); + return failure(); + } + + // Create the offloading entry. + auto offloadEntryType = getOffloadEntryType(context); + GlobalOp offloadingEntry = builder.create( + loc, /* Type */ offloadEntryType, /* Constant */ true, + /* Name */ entryId, + /* Linkage */ Linkage::Weak, /* DSO local */ false, + /* Thread local */ false, /* Value */ nullptr, + /* Alignment */ builder.getIntegerAttr(i64Type, 1), + /* Address space */ 0, + /* Unnamed address */ nullptr, + /* Section */ builder.getStringAttr(offloadSection)); + + // Add an initializer to the global. + auto block = builder.createBlock(&offloadingEntry.getRegion()); + builder.setInsertionPointToStart(block); + + // Create an undef struct entry. + Value entryInit = builder.create(loc, offloadEntryType).getRes(); + + // Insert the stub address to the offloading entry. 
+ auto stubAddress = builder.create( + loc, ptrType, getStubIdentifier(kernelLaunch)); + entryInit = builder.create( + entryInit.getLoc(), entryInit, stubAddress, llvm::ArrayRef{0}); + + // Insert the kernel name offloading entry to the offloading entry. + auto entryNameAddress = + builder.create(loc, ptrType, entryName.getSymName()); + entryInit = builder.create(entryInit.getLoc(), entryInit, + entryNameAddress, + llvm::ArrayRef{1}); + + // Set struct fields according to: + // https://clang.llvm.org/docs/OffloadingDesign.html#generating-offloading-entries + auto c0I64 = + builder.create(loc, builder.getIntegerAttr(i64Type, 0)); + entryInit = builder.create(entryInit.getLoc(), entryInit, + c0I64, llvm::ArrayRef{2}); + auto c0I32 = + builder.create(loc, builder.getIntegerAttr(i32Type, 0)); + entryInit = builder.create(entryInit.getLoc(), entryInit, + c0I32, llvm::ArrayRef{3}); + entryInit = builder.create(entryInit.getLoc(), entryInit, + c0I32, llvm::ArrayRef{4}); + builder.create(loc, entryInit); + return success(); +} + +Value GPUOffloadBuilder::getOrCreateLaunchAddress( + gpu::LaunchFuncOp kernelLaunch, OpBuilder &builder) { + auto ptrType = LLVM::LLVMPointerType::get(builder.getContext()); + auto stubId = createKernelStub(kernelLaunch, builder); + if (failed(insertOffloadEntry(kernelLaunch, builder))) + return {}; + return builder.create(kernelLaunch.getLoc(), ptrType, + stubId); +} + +LogicalResult +GPUOffloadBuilder::createOffloadObject(ModuleOp op, + LLVMTypeConverter &converter) { + using namespace LLVM; + OpBuilder builder(op.getContext()); + auto nativeIntType = builder.getIntegerType(converter.getPointerBitwidth()); + + // Collect all GPUModules. + SmallVector modules; + op.walk([&modules](gpu::GPUModuleOp op) { modules.push_back(op); }); + + // If there's no work finish without creating the object. + if (modules.empty()) + return success(); + + // Concatenate all offloading entries. + SmallVector binaryData; + llvm::raw_svector_ostream outputStream(binaryData); + for (auto module : modules) { + bool hasAnnotation = false; + if (auto attr = module->getAttr(gpu::getGpuOffloadObjectAnnotation())) { + if (auto bytecode = dyn_cast(attr)) { + outputStream << bytecode.getValue(); + hasAnnotation = true; + } + } + if (!hasAnnotation) { + module.emitError() << "the gpu.module doesn't contain an offload object."; + return failure(); + } + } + + // Set the insertion point to the start of the module. + builder.setInsertionPointToStart(op.getBody()); + auto stringCnstType = LLVMArrayType::get(IntegerType::get(op.getContext(), 8), + binaryData.size()); + + // Create the offload section with all the binary annotations. TODO add + // !exclude metadata to this variable see: + // https://llvm.org/docs/LangRef.html#exclude-metadata + auto offloadObject = builder.create( + op.getLoc(), /* Type */ stringCnstType, /* Constant */ true, + /* Name */ kOffloadObjectGlobalId, + /* Linkage */ Linkage::Private, /* DSO local */ false, + /* Thread local */ false, /* Value */ builder.getStringAttr(binaryData), + /* Alignment */ builder.getIntegerAttr(nativeIntType, 8), + /* Address space */ 0, + /* Unnamed address */ nullptr, + /* Section */ builder.getStringAttr(kLLVMOffloadingSection)); + + // This second global is to prevent the offloadObject from being optimized + // away. 
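+  // The emitted IR is roughly:
+  //   @llvm.compiler.used = appending global [1 x ptr]
+  //       [ptr @llvm.embedded.object], section "llvm.metadata"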
+ auto ptrType = LLVM::LLVMPointerType::get(op.getContext()); + auto ptrArrayType = LLVMArrayType::get(ptrType, 1); + auto llvmMetadata = builder.create( + op.getLoc(), /* Type */ ptrArrayType, /* Constant */ false, + /* Name */ kCompilerUsedGlobalId, + /* Linkage */ Linkage::Appending, /* DSO local */ false, + /* Thread local */ false, /* Value */ nullptr, + /* Alignment */ nullptr, + /* Address space */ 0, + /* Unnamed address */ nullptr, + /* Section */ builder.getStringAttr(kLLVMMetadataSection)); + + // Insert the offloadObject to the llvmMetadata section to prevent being + // optimized away. + auto block = builder.createBlock(&llvmMetadata.getRegion()); + builder.setInsertionPointToStart(block); + Value entryInit = builder.create(op.getLoc(), ptrArrayType).getRes(); + auto stubAddress = builder.create(op.getLoc(), ptrType, + offloadObject.getName()); + entryInit = builder.create( + entryInit.getLoc(), entryInit, stubAddress, llvm::ArrayRef{0}); + builder.create(op.getLoc(), entryInit); + return success(); +} + LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder, ArrayRef arguments) const { auto module = builder.getBlock()->getParent()->getParentOfType(); @@ -759,9 +1129,10 @@ LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers()); } -// Emits LLVM IR to launch a kernel function. Expects the module that contains -// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a -// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR. +// If llvmOffload is set to false in the pattern emits LLVM IR to launch a +// kernel function. Expects the module that contains the compiled kernel +// function as a cubin in the 'nvvm.cubin' attribute, or a hsaco in the +// 'rocdl.hsaco' attribute of the kernel function in the IR. // // %0 = call %binarygetter // %1 = call %moduleLoad(%0) @@ -774,8 +1145,20 @@ // call %streamDestroy(%4) // call %moduleUnload(%1) // +// When llvmOffload is set to true, this pattern emits LLVM IR to launch a +// kernel function and expects LLVM offload annotations in the gpu.modules. In +// this case the generated code is: +// +// %0 = call %streamCreate() +// %1 = +// %2 = +// call %launchKernel(%2, , 0, %0, %1, nullptr) +// call %streamSynchronize(%0) +// call %streamDestroy(%0) +// // If the op is async, the stream corresponds to the (single) async dependency // as well as the async token the op produces. +// LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( gpu::LaunchFuncOp launchOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { @@ -795,34 +1178,45 @@ Location loc = launchOp.getLoc(); - // Create an LLVM global with CUBIN extracted from the kernel annotation and - // obtain a pointer to the first byte in it. - auto kernelModule = SymbolTable::lookupNearestSymbolFrom( - launchOp, launchOp.getKernelModuleName()); - assert(kernelModule && "expected a kernel module"); - - auto binaryAttr = - kernelModule->getAttrOfType(gpuBinaryAnnotation); - if (!binaryAttr) { - kernelModule.emitOpError() - << "missing " << gpuBinaryAnnotation << " attribute"; - return failure(); + Value kernelAddress, moduleAddress; + // If llvmOffload is set to false, generate the traditional offload code. + if (!llvmOffload) { + // Create an LLVM global with CUBIN extracted from the kernel annotation and + // obtain a pointer to the first byte in it. 
+ auto kernelModule = SymbolTable::lookupNearestSymbolFrom( + launchOp, launchOp.getKernelModuleName()); + assert(kernelModule && "expected a kernel module"); + + auto binaryAttr = + kernelModule->getAttrOfType(gpuBinaryAnnotation); + if (!binaryAttr) { + kernelModule.emitOpError() + << "missing " << gpuBinaryAnnotation << " attribute"; + return failure(); + } + SmallString<128> nameBuffer(kernelModule.getName()); + nameBuffer.append(kGpuBinaryStorageSuffix); + Value data = LLVM::createGlobalString( + loc, rewriter, nameBuffer.str(), binaryAttr.getValue(), + LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers()); + + moduleAddress = + moduleLoadCallBuilder.create(loc, rewriter, data).getResult(); + // Get the function from the module. The name corresponds to the name of + // the kernel function. + auto kernelName = generateKernelNameConstant( + launchOp.getKernelModuleName().getValue(), + launchOp.getKernelName().getValue(), loc, rewriter); + kernelAddress = moduleGetFunctionCallBuilder + .create(loc, rewriter, {moduleAddress, kernelName}) + .getResult(); + } else { + kernelAddress = + GPUOffloadBuilder::getOrCreateLaunchAddress(launchOp, rewriter); + if (!kernelAddress) + return failure(); } - SmallString<128> nameBuffer(kernelModule.getName()); - nameBuffer.append(kGpuBinaryStorageSuffix); - Value data = LLVM::createGlobalString( - loc, rewriter, nameBuffer.str(), binaryAttr.getValue(), - LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers()); - - auto module = moduleLoadCallBuilder.create(loc, rewriter, data); - // Get the function from the module. The name corresponds to the name of - // the kernel function. - auto kernelName = generateKernelNameConstant( - launchOp.getKernelModuleName().getValue(), - launchOp.getKernelName().getValue(), loc, rewriter); - auto function = moduleGetFunctionCallBuilder.create( - loc, rewriter, {module.getResult(), kernelName}); Value zero = rewriter.create(loc, llvmInt32Type, 0); Value stream = adaptor.getAsyncDependencies().empty() @@ -836,7 +1230,7 @@ : zero; launchKernelCallBuilder.create( loc, rewriter, - {function.getResult(), adaptor.getGridSizeX(), adaptor.getGridSizeY(), + {kernelAddress, adaptor.getGridSizeX(), adaptor.getGridSizeY(), adaptor.getGridSizeZ(), adaptor.getBlockSizeX(), adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(), dynamicSharedMemorySize, stream, kernelParams, /*extra=*/nullpointer}); @@ -852,7 +1246,8 @@ streamDestroyCallBuilder.create(loc, rewriter, stream); rewriter.eraseOp(launchOp); } - moduleUnloadCallBuilder.create(loc, rewriter, module.getResult()); + if (!llvmOffload) + moduleUnloadCallBuilder.create(loc, rewriter, moduleAddress); return success(); } @@ -978,6 +1373,30 @@ ConvertWaitOpToGpuRuntimeCallPattern, ConvertAsyncYieldToGpuRuntimeCallPattern>(converter); patterns.add( - converter, gpuBinaryAnnotation, kernelBarePtrCallConv); + converter, gpuBinaryAnnotation, kernelBarePtrCallConv, + /* disable llvmOffload */ false); + patterns.add(&converter.getContext()); +} + +void mlir::populateGpuToLLVMOffloadConversionPatterns( + LLVMTypeConverter &converter, RewritePatternSet &patterns, + bool kernelBarePtrCallConv) { + converter.addConversion([&converter](gpu::AsyncTokenType type) -> Type { + return converter.getPointerType( + IntegerType::get(&converter.getContext(), 8)); + }); + patterns.add(converter); + patterns.add( + converter, gpu::getDefaultGpuBinaryAnnotation(), kernelBarePtrCallConv, + /* enable llvmOffload */ true); patterns.add(&converter.getContext()); } diff --git 
a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -47,7 +47,9 @@ Transforms/AllReduceLowering.cpp Transforms/AsyncRegionRewriter.cpp Transforms/KernelOutlining.cpp + Transforms/GpuToDeviceOffload.cpp Transforms/MemoryPromotion.cpp + Transforms/NameMangling.cpp Transforms/ParallelLoopMapper.cpp Transforms/SerializeToBlob.cpp Transforms/SerializeToCubin.cpp @@ -88,6 +90,28 @@ add_subdirectory(TransformOps) +if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) + if (NOT DEFINED CUDAToolkit_ROOT) + find_package(CUDAToolkit) + get_filename_component(CUDAToolkit_ROOT ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) + endif() + + # Enable gpu-to-nvptx pass. + target_compile_definitions(obj.MLIRGPUTransforms + PRIVATE + MLIR_GPU_TO_NVPTX_PASS_ENABLE=1 + # This variable should be set to CUDAToolkit_LIBRARY_ROOT, however the + # variable is unset for recent cuda toolkits, see: + # https://gitlab.kitware.com/cmake/cmake/-/issues/24858 + __DEFAULT_CUDATOOLKIT_PATH__="${CUDAToolkit_ROOT}" + ) + +target_link_libraries(MLIRGPUTransforms + PRIVATE + MLIRNVVMToLLVMIRTranslation +) +endif() + if(MLIR_ENABLE_CUDA_RUNNER) if(NOT MLIR_ENABLE_CUDA_CONVERSIONS) message(SEND_ERROR @@ -127,16 +151,29 @@ endif() +if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD) + # Enable gpu-to-amdgpu pass. + set(ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs") + target_compile_definitions(obj.MLIRGPUTransforms + PRIVATE + MLIR_GPU_TO_AMDGPU_PASS_ENABLE=1 + __DEFAULT_ROCM_PATH__="${ROCM_PATH}" + ) + + target_link_libraries(MLIRGPUTransforms + PRIVATE + MLIRROCDLToLLVMIRTranslation + ) +endif() + if(MLIR_ENABLE_ROCM_CONVERSIONS) if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)) message(SEND_ERROR "Building mlir with ROCm support requires the AMDGPU backend") endif() - set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs") target_compile_definitions(obj.MLIRGPUTransforms PRIVATE - __DEFAULT_ROCM_PATH__="${DEFAULT_ROCM_PATH}" MLIR_GPU_TO_HSACO_PASS_ENABLE=1 ) diff --git a/mlir/lib/Dialect/GPU/Transforms/GpuToDeviceObjectCommon.h b/mlir/lib/Dialect/GPU/Transforms/GpuToDeviceObjectCommon.h new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/GpuToDeviceObjectCommon.h @@ -0,0 +1,516 @@ +//===- GpuToDeviceObjectCommon.h - GPU to Device object utilities ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a series of utilities for transforming GPUModuleOps into +// binary annotations. 
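+// The overall flow is: translate the gpu.module to LLVM IR, link in device
+// bitcode libraries, optionally optimize, serialize the result into an
+// offload object, and attach it to the gpu.module as attributes.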
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/ExecutionEngine/OptUtils.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" + +#include "llvm/ADT/StringExtras.h" +#include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Object/OffloadBinary.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/TargetParser.h" +#include "llvm/Transforms/IPO/Internalize.h" + +#include "mlir/Dialect/GPU/Transforms/Passes.h" + +namespace mlir { +namespace gpu { +// Mixin class for all GpuToDevice* passes. +// This class needs to be listed as a friend class, as it will access protected +// members. +template +class GpuToDeviceOffloadMixin { +private: + Derived &getDerived() { return static_cast(*this); } + +protected: + // Function interfaces to be implemented by the final class. + // Return the optimization level, -1 signifies don't run the optimization + // pipeline. + int getOptLevel() { return -1; } + + // Hook for loading bitcode files, returns std::nullopt on failure. + std::optional>> + loadBitcodeFiles(llvm::LLVMContext &context, llvm::Module &module) { + return SmallVector>{}; + } + + // Hook for performing additional actions on a loaded bitcode file. + void handleBitcodeFile(llvm::Module &module) {} + + // Hook for performing additional actions on the llvmModule pre linking. + void handleModulePreLink(llvm::Module &module) {} + + // Hook for performing additional actions on the llvmModule post linking. + void handleModulePostLink(llvm::Module &module) {} + +protected: + // Create the target machine based on the target triple and chip. + std::unique_ptr createTargetMachine(); + + // Loads a bitcode file from path. + std::unique_ptr loadBitcodeFile(llvm::LLVMContext &context, + StringRef path); + + // Loads multiple bitcode files. + LogicalResult + loadBitcodeFileList(llvm::LLVMContext &context, + ArrayRef fileList, + SmallVector> &llvmModules, + bool failureOnError = true); + + // Translates the gpu.module to LLVM IR. + std::unique_ptr + translateToLLVMIR(llvm::LLVMContext &llvmContext); + + // Link the llvmModule to other bitcode file. + LogicalResult linkFiles(llvm::Module &module, + SmallVector> &&libs); + + // Optimize the module. + LogicalResult optimizeModule(llvm::Module &module, + llvm::TargetMachine &targetMachine, + int optLevel = 3); + + // Serializes the LLVM IR bitcode to a special object file described in: + // https://clang.llvm.org/docs/ClangOffloadPackager.html + SmallVector serializeModuleToObject(llvm::Module &llvmModule, + gpu::OffloadKind offloadKind); + + // Insert the binary annotation to the GPUModule. + void insertAnnotations(gpu::GPUModuleOp module, + SmallVector &binaryObject, + gpu::OffloadKind offloadKind); + + // Run the pass. 
+ void run(gpu::OffloadKind offloadKind); +}; + +template +std::unique_ptr +GpuToDeviceOffloadMixin::createTargetMachine() { + auto &self = getDerived(); + Location loc = self.getOperation().getLoc(); + std::string error; + + // Load the target. + const llvm::Target *target = + llvm::TargetRegistry::lookupTarget(self.triple, error); + if (!target) { + emitError(loc, Twine("failed to lookup target: ") + error); + return {}; + } + + // Create the target machine using the target. + llvm::TargetMachine *machine = target->createTargetMachine( + self.triple, self.chip, self.features, {}, {}); + if (!machine) { + emitError(loc, "failed to create target machine"); + return {}; + } + return std::unique_ptr{machine}; +} + +template +std::unique_ptr +GpuToDeviceOffloadMixin::loadBitcodeFile(llvm::LLVMContext &context, + StringRef path) { + auto &self = getDerived(); + llvm::SMDiagnostic error; + std::unique_ptr library = + llvm::getLazyIRFileModule(path, error, context); + if (!library) { + self.getOperation().emitError() << "Failed to load file from " << path + << ", error: " << error.getMessage(); + return nullptr; + } + return library; +} + +template +LogicalResult GpuToDeviceOffloadMixin::loadBitcodeFileList( + llvm::LLVMContext &context, ArrayRef fileList, + SmallVector> &llvmModules, + bool failureOnError) { + auto &self = getDerived(); + for (const std::string &str : fileList) { + // Test if the path exists, if it doesn't abort. + StringRef pathRef = StringRef(str.data(), str.size()); + if (!llvm::sys::fs::is_regular_file(pathRef)) { + self.getOperation().emitError() + << "File path: " << pathRef << " does not exist or is not a file.\n"; + return failure(); + } + // Load the file or abort on error. + if (auto bcFile = loadBitcodeFile(context, pathRef)) + llvmModules.push_back(std::move(bcFile)); + else if (failureOnError) + return failure(); + } + return success(); +} + +template +std::unique_ptr +GpuToDeviceOffloadMixin::translateToLLVMIR( + llvm::LLVMContext &llvmContext) { + return translateModuleToLLVMIR(getDerived().getOperation(), llvmContext, + "LLVMDialectModule"); +} + +template +LogicalResult GpuToDeviceOffloadMixin::linkFiles( + llvm::Module &module, SmallVector> &&libs) { + auto &self = getDerived(); + if (libs.empty()) + return success(); + llvm::Linker linker(module); + for (std::unique_ptr &libModule : libs) { + // This bitcode linking code is substantially similar to what is used in + // hip-clang It imports the library functions into the module, allowing LLVM + // optimization passes (which must run after linking) to optimize across the + // libraries and the module's code. We also only import symbols if they are + // referenced by the module or a previous library since there will be no + // other source of references to those symbols in this compilation and since + // we don't want to bloat the resulting code object. 
+ bool err = linker.linkInModule( + std::move(libModule), llvm::Linker::Flags::LinkOnlyNeeded, + [](llvm::Module &m, const StringSet<> &gvs) { + llvm::internalizeModule(m, [&gvs](const llvm::GlobalValue &gv) { + return !gv.hasName() || (gvs.count(gv.getName()) == 0); + }); + }); + // True is linker failure + if (err) { + self.getOperation().emitError( + "unrecoverable failure during device library linking."); + // We have no guaranties about the state of `ret`, so bail + return failure(); + } + } + return success(); +} + +template +LogicalResult GpuToDeviceOffloadMixin::optimizeModule( + llvm::Module &module, llvm::TargetMachine &targetMachine, int optLevel) { + auto &self = getDerived(); + if (optLevel < 0 || optLevel > 3) + return self.getOperation().emitError() + << "Invalid optimization level" << optLevel << "\n"; + + targetMachine.setOptLevel(static_cast(optLevel)); + + auto transformer = + makeOptimizingTransformer(optLevel, /*sizeLevel=*/0, &targetMachine); + auto error = transformer(&module); + if (error) { + InFlightDiagnostic mlirError = self.getOperation()->emitError(); + llvm::handleAllErrors( + std::move(error), [&mlirError](const llvm::ErrorInfoBase &ei) { + mlirError << "Could not optimize LLVM IR: " << ei.message() << "\n"; + }); + return mlirError; + } + return success(); +} + +template +SmallVector GpuToDeviceOffloadMixin::serializeModuleToObject( + llvm::Module &llvmModule, gpu::OffloadKind offloadKind) { + using namespace llvm; + using namespace llvm::object; + auto &self = getDerived(); + + // Set the offload kind. + llvm::object::OffloadKind offKind = OFK_None; + if (offloadKind == gpu::OffloadKind::cuda) + offKind = OFK_Cuda; + else if (offloadKind == gpu::OffloadKind::hip) + offKind = OFK_HIP; + + SmallVector offloadData; + { + // Create the offload object, for more information check: + // https://clang.llvm.org/docs/ClangOffloadPackager.html + OffloadBinary::OffloadingImage imageBinary{}; + std::unique_ptr buffer; + // Add a scope to trash the binaryObject buffer as soon it's done being + // used. + { + SmallVector binaryData; + // Write the LLVM module bitcode to a buffer. + raw_svector_ostream outputStream(binaryData); + WriteBitcodeToFile(llvmModule, outputStream); + imageBinary.TheImageKind = IMG_Bitcode; + imageBinary.TheOffloadKind = offKind; + imageBinary.StringData["triple"] = self.triple; + // Avoid setting the arch if no arch was given in the cmd, as clang will + // compile code only for this arch if set, so running the code on an + // incompatible arch will result in error. + if (!self.chip.isDefaultOption()) + imageBinary.StringData["arch"] = self.chip; + imageBinary.Image = MemoryBuffer::getMemBuffer( + StringRef(binaryData.data(), binaryData.size()), "", false); + buffer = OffloadBinary::write(imageBinary); + } + // Check that the image was properly created. This step was taken from: + // https://github.com/llvm/llvm-project/blob/main/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp + if (buffer->getBufferSize() % OffloadBinary::getAlignment() != 0) { + emitError(self.getOperation().getLoc(), + "Offload binary has an invalid size alignment"); + return {}; + } + // Write the buffer. 
+ raw_svector_ostream outputStream(offloadData); + outputStream << buffer->getBuffer(); + } + return offloadData; +} + +template +void GpuToDeviceOffloadMixin::insertAnnotations( + gpu::GPUModuleOp module, SmallVector &binaryObject, + gpu::OffloadKind offloadKind) { + auto &self = getDerived(); + // Create a pair of annotations to store the object and the offload kind. + module->setAttr( + gpu::getGpuOffloadObjectAnnotation(), + StringAttr::get(&self.getContext(), + StringRef(binaryObject.data(), binaryObject.size()))); + module->setAttr( + gpu::getGpuOffloadKindAnnotation(), + StringAttr::get(&self.getContext(), gpu::fromOffloadKind(offloadKind))); +} + +template +void GpuToDeviceOffloadMixin::run(gpu::OffloadKind offloadKind) { + auto &self = getDerived(); + // Translate the GPUModule to LLVM IR. + llvm::LLVMContext llvmContext; + std::unique_ptr llvmModule = + self.translateToLLVMIR(llvmContext); + if (!llvmModule) + return self.signalPassFailure(); + + // Create the target machine. + std::unique_ptr targetMachine = + self.createTargetMachine(); + if (!targetMachine) + return self.signalPassFailure(); + + // Set the data layout and target triple of the module. + llvmModule->setDataLayout(targetMachine->createDataLayout()); + llvmModule->setTargetTriple(targetMachine->getTargetTriple().getTriple()); + + // Link bitcode files. + self.handleModulePreLink(*llvmModule); + { + auto libs = self.loadBitcodeFiles(llvmContext, *llvmModule); + if (!libs) + return self.signalPassFailure(); + if (libs->size()) + if (failed(self.linkFiles(*llvmModule, std::move(*libs)))) + return self.signalPassFailure(); + self.handleModulePostLink(*llvmModule); + } + + // Optimize the module. + auto optLevel = self.getOptLevel(); + if (optLevel != -1 && + failed(self.optimizeModule(*llvmModule, *targetMachine, optLevel))) + return self.signalPassFailure(); + + // Serialize the LLVM Module to an object file. + auto binaryObject = self.serializeModuleToObject(*llvmModule, offloadKind); + auto op = self.getOperation(); + if (binaryObject.empty()) { + emitError(op.getLoc(), "Failed to serialize to bitcode."); + return self.signalPassFailure(); + } + + // Insert the binary object to the module. + self.insertAnnotations(op, binaryObject, offloadKind); +} + +// NVPTX specific mixin implementation of common functions for +// `GpuToDeviceOffloadMixin`. This class needs to be listed as a friend class, +// as it will access protected members. +template +class GPUToNVPTXMixin { +private: + Derived &getDerived() { return static_cast(*this); } + +protected: + using NVPTXBase = GPUToNVPTXMixin; + + // Implementation of GpuToDeviceOffloadMixin::loadBitcodeFiles. It can be used + // in child classes by adding `using NVPTXBase::loadBitcodeFiles;`. + std::optional>> + loadBitcodeFiles(llvm::LLVMContext &context, llvm::Module &module) { + auto &self = getDerived(); + SmallVector> bcFiles; + + // Try to load libdevice from a cuda installation. + StringRef pathRef(self.cudaPath.getValue()); + if (pathRef.size()) { + SmallVector path; + path.insert(path.begin(), pathRef.begin(), pathRef.end()); + pathRef = StringRef(path.data(), path.size()); + if (!llvm::sys::fs::is_directory(pathRef)) { + self.getOperation().emitError() + << "CUDA path: " << pathRef + << " does not exist or is not a directory.\n"; + return std::nullopt; + } + // TODO remove this hard coded path. 
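+      // With a standard CUDA toolkit layout this resolves to
+      // <cuda-path>/nvvm/libdevice/libdevice.10.bc.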
+ llvm::sys::path::append(path, "nvvm", "libdevice", "libdevice.10.bc"); + pathRef = StringRef(path.data(), path.size()); + if (!llvm::sys::fs::is_regular_file(pathRef)) { + self.getOperation().emitError() + << "LibDevice path: " << pathRef + << " does not exist or is not a file.\n"; + return std::nullopt; + } + if (auto bcFile = self.loadBitcodeFile(context, pathRef)) + bcFiles.push_back(std::move(bcFile)); + } + + // Add extra libraries. + if (failed(self.loadBitcodeFileList(context, self.bcPaths, bcFiles, true))) + return std::nullopt; + return bcFiles; + } +}; + +// AMDGPU specific mixin implementation of common functions for +// `GpuToDeviceOffloadMixin`. This class needs to be listed as a friend class, +// as it will access protected members. +template +class GPUToAMDGPUMixin { +private: + Derived &getDerived() { return static_cast(*this); } + +protected: + using AMDGPUBase = GPUToAMDGPUMixin; + + // Get the paths of ROCm device libraries. Function adapted from: + // https://github.com/llvm/llvm-project/blob/main/clang/lib/Driver/ToolChains/AMDGPU.cpp + void getCommonBitcodeLibs(llvm::SmallVector &libs, + SmallVector &libPath, + StringRef isaVersion, bool wave64, bool daz, + bool finiteOnly, bool unsafeMath, bool fastMath, + bool correctSqrt, StringRef abiVer) { + auto &self = getDerived(); + auto addLib = [&](StringRef path) { + if (!llvm::sys::fs::is_regular_file(path)) { + self.getOperation().emitRemark() + << "Bitcode library path: " << path + << " does not exist or is not a file.\n"; + return; + } + libs.push_back(path.str()); + }; + auto optLib = [](StringRef name, bool on) -> Twine { + return name + (on ? "_on" : "_off"); + }; + auto getLibPath = [&libPath](Twine lib) { + auto baseSize = libPath.size(); + llvm::sys::path::append(libPath, lib + ".bc"); + std::string path(StringRef(libPath.data(), libPath.size()).str()); + libPath.truncate(baseSize); + return path; + }; + + // Add ROCm device libraries. + addLib(getLibPath("ocml")); + addLib(getLibPath("ockl")); + addLib(getLibPath(optLib("oclc_daz_opt", daz))); + addLib(getLibPath(optLib("oclc_unsafe_math", unsafeMath || fastMath))); + addLib(getLibPath(optLib("oclc_finite_only", finiteOnly || fastMath))); + addLib(getLibPath(optLib("oclc_correctly_rounded_sqrt", correctSqrt))); + addLib(getLibPath(optLib("oclc_wavefrontsize64", wave64))); + addLib(getLibPath("oclc_isa_version_" + isaVersion)); + if (abiVer.size()) + addLib(getLibPath("oclc_abi_version_" + abiVer)); + } + + // Implementation of GpuToDeviceOffloadMixin::loadBitcodeFiles. It can be used + // in child classes by adding `using AMDGPUBase::loadBitcodeFiles;`. + std::optional>> + loadBitcodeFiles(llvm::LLVMContext &context, llvm::Module &module) { + SmallVector> bcFiles; + auto &self = getDerived(); + SmallVector libsPath; + + // Try to load device libraries from a ROCm installation. + StringRef pathRef(self.rocmPath.getValue()); + if (pathRef.size()) { + SmallVector path; + path.insert(path.begin(), pathRef.begin(), pathRef.end()); + // TODO remove this hard coded ROCm path. 
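+      // The device libraries are expected under <rocm-path>/amdgcn/bitcode.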
+ llvm::sys::path::append(path, "amdgcn", "bitcode"); + pathRef = StringRef(path.data(), path.size()); + if (!llvm::sys::fs::is_directory(pathRef)) { + self.getOperation().emitRemark() + << "ROCm amdgcn bitcode path: " << pathRef + << " does not exist or is not a directory\n"; + return std::nullopt; + } + std::string isaVersion; + auto isaVer = llvm::AMDGPU::getIsaVersion(self.chip); + if (isaVer.Major != 0) + isaVersion = std::to_string(isaVer.Major) + + std::to_string(isaVer.Minor) + + llvm::utohexstr(isaVer.Stepping, /*lower case*/ true); + getCommonBitcodeLibs(libsPath, path, isaVersion, self.wave64, self.daz, + self.finiteOnly, self.unsafeMath, self.fastMath, + self.correctSqrt, self.abiVer); + } else + libsPath.reserve(libsPath.size() + self.bcPaths.size()); + + libsPath.insert(libsPath.end(), self.bcPaths.begin(), self.bcPaths.end()); + + // Add extra libraries. + if (failed(self.loadBitcodeFileList(context, libsPath, bcFiles, true))) + return std::nullopt; + return bcFiles; + } + + // Implementation of GpuToDeviceOffloadMixin::handleBitcodeFile. It can be + // used in child classes by adding `using AMDGPUBase::handleBitcodeFile;`. + void handleBitcodeFile(llvm::Module &module) { + // Some ROCM builds don't strip this like they should + if (auto *openclVersion = module.getNamedMetadata("opencl.ocl.version")) + module.eraseNamedMetadata(openclVersion); + // Stop spamming us with clang version numbers + if (auto *ident = module.getNamedMetadata("llvm.ident")) + module.eraseNamedMetadata(ident); + } +}; +} // namespace gpu +} // namespace mlir diff --git a/mlir/lib/Dialect/GPU/Transforms/GpuToDeviceOffload.cpp b/mlir/lib/Dialect/GPU/Transforms/GpuToDeviceOffload.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/GpuToDeviceOffload.cpp @@ -0,0 +1,195 @@ +//===- GpuToDeviceOffload.cpp - Impl. of GPU to NVPTX & AMDGPU passes -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the GpuToNVPTXOffload & GpuToAMDGPUOffload passes. +// +//===----------------------------------------------------------------------===// + +#include "GpuToDeviceObjectCommon.h" + +using namespace mlir; + +namespace mlir { +namespace gpu { +StringRef getDefaultCudaToolkitPath() { +#ifdef __DEFAULT_CUDATOOLKIT_PATH__ + return __DEFAULT_CUDATOOLKIT_PATH__; +#else + return ""; +#endif +} + +StringRef getDefaultRocmPath() { +#ifdef __DEFAULT_ROCM_PATH__ + return __DEFAULT_ROCM_PATH__; +#else + return ""; +#endif +} + +StringRef getGpuOffloadObjectAnnotation() { return "llvm.offload.object"; } + +StringRef getGpuOffloadKindAnnotation() { return "llvm.offload.kind"; } +} // namespace gpu + +#define GEN_PASS_DEF_GPUTONVPTXOFFLOAD +#include "mlir/Dialect/GPU/Transforms/Passes.h.inc" +#define GEN_PASS_DEF_GPUTOAMDGPUOFFLOAD +#include "mlir/Dialect/GPU/Transforms/Passes.h.inc" +} // namespace mlir + +#ifdef MLIR_GPU_TO_NVPTX_PASS_ENABLE +#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h" +namespace { +// NVPTX target initializer. +struct InitNVPTXTarget { + InitNVPTXTarget() { + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTargetMC(); + LLVMInitializeNVPTXAsmPrinter(); + } +}; + +// This ensures that the target is initialized once. 
+llvm::ManagedStatic nvptxTargetInit; +// CUDA specific Offload pass +class GpuToNVPTXOffload + : public impl::GpuToNVPTXOffloadBase, + public gpu::GpuToDeviceOffloadMixin, + public gpu::GPUToNVPTXMixin { +private: + template + friend class ::mlir::gpu::GpuToDeviceOffloadMixin; + template + friend class ::mlir::gpu::GPUToNVPTXMixin; + +public: + using Base::Base; + using NVPTXBase::loadBitcodeFiles; + + // Initialize the NVPTX target. + LogicalResult initialize(MLIRContext *context) override; + + // Add LLVM IR dialect translations to the registry. + void getDependentDialects(DialectRegistry ®istry) const override; + + void runOnOperation() final; + +protected: + // Return the optimization level, -1 signifies don't run the optimization + // pipeline. + int getOptLevel() { return optLevel.getValue(); } +}; +} // namespace + +LogicalResult GpuToNVPTXOffload::initialize(MLIRContext *context) { + *nvptxTargetInit; + return success(); +} + +void GpuToNVPTXOffload::getDependentDialects(DialectRegistry ®istry) const { + impl::GpuToNVPTXOffloadBase::getDependentDialects( + registry); + registerGPUDialectTranslation(registry); + registerLLVMDialectTranslation(registry); + registerNVVMDialectTranslation(registry); +} + +void GpuToNVPTXOffload::runOnOperation() { run(gpu::OffloadKind::cuda); } +#else +namespace { +class GpuToNVPTXOffload + : public impl::GpuToNVPTXOffloadBase { +public: + using impl::GpuToNVPTXOffloadBase::GpuToNVPTXOffloadBase; + + void runOnOperation() final { + getOperation().emitError() + << "This pass requires the NVPTX target but it wasn't built."; + return signalPassFailure(); + } +}; +} // namespace +#endif + +#ifdef MLIR_GPU_TO_AMDGPU_PASS_ENABLE +#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h" +namespace { +// AMDGPU target initializer. +struct InitAMDGPUTarget { + InitAMDGPUTarget() { + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUAsmPrinter(); + } +}; +// This ensures that the target is initialized once. +llvm::ManagedStatic amdgpuTargetInit; + +// AMDGPU specific Offload pass +class GpuToAMDGPUOffload + : public impl::GpuToAMDGPUOffloadBase, + public gpu::GpuToDeviceOffloadMixin, + public gpu::GPUToAMDGPUMixin { +private: + template + friend class ::mlir::gpu::GpuToDeviceOffloadMixin; + template + friend class ::mlir::gpu::GPUToAMDGPUMixin; + +public: + using AMDGPUBase::handleBitcodeFile; + using AMDGPUBase::loadBitcodeFiles; + using Base::Base; + // Initialize the AMDGPU target. + LogicalResult initialize(MLIRContext *context) override; + + // Add LLVM IR dialect translations to the registry. + void getDependentDialects(DialectRegistry ®istry) const override; + + void runOnOperation() final; + +protected: + // Return the optimization level, -1 signifies don't run the optimization + // pipeline. 
+ int getOptLevel() { return optLevel.getValue(); } +}; +} // namespace + +LogicalResult GpuToAMDGPUOffload::initialize(MLIRContext *context) { + *amdgpuTargetInit; + return success(); +} + +void GpuToAMDGPUOffload::getDependentDialects(DialectRegistry ®istry) const { + impl::GpuToAMDGPUOffloadBase::getDependentDialects( + registry); + registerGPUDialectTranslation(registry); + registerLLVMDialectTranslation(registry); + registerROCDLDialectTranslation(registry); +} + +void GpuToAMDGPUOffload::runOnOperation() { run(gpu::OffloadKind::hip); } +#else +namespace { +class GpuToAMDGPUOffload + : public impl::GpuToAMDGPUOffloadBase { +public: + using impl::GpuToAMDGPUOffloadBase< + GpuToAMDGPUOffload>::GpuToAMDGPUOffloadBase; + + void runOnOperation() final { + getOperation().emitError() + << "This pass requires the AMDGPU target but it wasn't built."; + return signalPassFailure(); + } +}; +} // namespace +#endif diff --git a/mlir/lib/Dialect/GPU/Transforms/NameMangling.cpp b/mlir/lib/Dialect/GPU/Transforms/NameMangling.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/NameMangling.cpp @@ -0,0 +1,125 @@ +//===- NameMangling.cpp - Implementation of GPU symbols mangling ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the GPU dialect name mangling pass. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/Transforms/Passes.h" + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; + +namespace mlir { +#define GEN_PASS_DEF_GPUNAMEMANGLING +#include "mlir/Dialect/GPU/Transforms/Passes.h.inc" +} // namespace mlir + +namespace { +// Mangle the names of all the top symbols inside a GPUModuleOp from symbol to +// "__G_S", for all GPUModuleOps in a module. +class GpuNameMangling : public impl::GpuNameManglingBase { +public: + using Base::Base; + + // Get the mangled name for the symbol. + StringAttr getMangledName(StringAttr moduleName, StringAttr symbol); + + // Mangle all the definitions inside a particular GPUModuleOp. + LogicalResult mangleNamesInModule(gpu::GPUModuleOp module); + + // Update all the symbol uses of a particular symbol inside the top module. + // `symbolUses` is the range of symbol uses of the gpu.module name in the top + // module symbol table. + void updateSymbolUses(SymbolTable::UseRange &&symbolUses); + + void runOnOperation() final; +}; +} // namespace + +StringAttr GpuNameMangling::getMangledName(StringAttr moduleName, + StringAttr symbol) { + std::string name = "__G" + moduleName.str() + "_S" + symbol.str(); + return StringAttr::get(&getContext(), name); +} + +LogicalResult GpuNameMangling::mangleNamesInModule(gpu::GPUModuleOp gpuModule) { + SymbolTable synbolTable(gpuModule); + for (auto &op : gpuModule.getBody()->getOperations()) { + // Ignore external functions. + if (auto fn = dyn_cast(op)) + if (fn.isExternal()) + continue; + if (auto symbol = dyn_cast(op)) { + auto mangledName = + getMangledName(gpuModule.getNameAttr(), symbol.getNameAttr()); + + // Replace all the symbol uses of `symbol` to its mangled name. 
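+      // e.g. `@bar` defined in `gpu.module @foo` is rewritten to
+      // `@__Gfoo_Sbar`.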
+ if (failed(synbolTable.replaceAllSymbolUses( + symbol.getNameAttr(), mangledName, &gpuModule.getRegion()))) { + emitError(op.getLoc(), "Failed to replace the symbol name."); + return failure(); + } + + // On symbol replacement success rename the symbol. + synbolTable.setSymbolName(symbol, mangledName); + } + } + return success(); +} + +void GpuNameMangling::updateSymbolUses(SymbolTable::UseRange &&symbolUses) { + // All symbolUses correspond to a particular gpu.module name. + for (auto symbolUse : symbolUses) { + Operation *operation = symbolUse.getUser(); + SmallVector> symbolReferences; + + // Collect all references to the `symbol` in the attributes of the + // operation. + for (auto opAttr : operation->getAttrs()) { + if (auto symbol = dyn_cast(opAttr.getValue())) + if (symbol == symbolUse.getSymbolRef()) + symbolReferences.push_back({opAttr.getName(), symbol}); + } + + // Update the symbol references. + for (auto &[attrName, symbol] : symbolReferences) { + auto nestedReferences = symbol.getNestedReferences(); + if (nestedReferences.size()) { + SmallVector updatedReferences(nestedReferences); + // Only the first nested reference was updated by the previous step, + // thus we just update that one. + updatedReferences[0] = FlatSymbolRefAttr::get(getMangledName( + symbol.getRootReference(), nestedReferences[0].getRootReference())); + operation->setAttr( + attrName, + SymbolRefAttr::get(symbol.getRootReference(), updatedReferences)); + } + } + } +} + +void GpuNameMangling::runOnOperation() { + auto module = getOperation(); + SmallVector gpuModules; + // Collect all gpu.modules. + module.walk([&gpuModules](gpu::GPUModuleOp op) { gpuModules.push_back(op); }); + SymbolTable moduleTable(module); + + // Mangle the names. + for (auto gpuModule : gpuModules) { + if (failed(mangleNamesInModule(gpuModule))) + return signalPassFailure(); + if (auto symbolUses = moduleTable.getSymbolUses(gpuModule.getNameAttr(), + &module.getRegion())) + updateSymbolUses(std::move(*symbolUses)); + } +} diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -207,6 +207,31 @@ ) endif() + if(MLIR_ENABLE_CUDART_RUNNER) + find_package(CUDAToolkit REQUIRED) + + add_mlir_library(mlir_cudart_runtime + SHARED + CudaRuntimeWrappers.cpp + + EXCLUDE_FROM_LIBMLIR + ) + set_property(TARGET mlir_cudart_runtime PROPERTY CXX_STANDARD 14) + target_include_directories(mlir_cudart_runtime + PRIVATE + ${CUDAToolkit_INCLUDE_DIRS} + ) + target_link_libraries(mlir_cudart_runtime + PRIVATE + CUDA::cudart + CUDA::cuda_driver + ) + target_compile_definitions(mlir_cudart_runtime + PRIVATE + MLIR_USE_CUDART_RUNNER=1 + ) + endif() + if(MLIR_ENABLE_ROCM_RUNNER) # Configure ROCm support. 
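To make the scheme implemented by GpuNameMangling::getMangledName above concrete, here is a dependency-free C++ sketch of the same transformation; mangleGpuSymbol is a hypothetical helper introduced only for illustration and is not part of the patch.

// Mirrors the "__G<gpu module name>_S<symbol>" scheme of the gpu-name-mangling
// pass above; compare with the expectations in mangle-names.mlir further down.
#include <iostream>
#include <string>

static std::string mangleGpuSymbol(const std::string &gpuModuleName,
                                   const std::string &symbolName) {
  return "__G" + gpuModuleName + "_S" + symbolName;
}

int main() {
  // @device_module::@kernel becomes @device_module::@__Gdevice_module_Skernel.
  std::cout << mangleGpuSymbol("device_module", "kernel") << "\n";
  return 0;
}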
if (NOT DEFINED ROCM_PATH) @@ -253,6 +278,12 @@ EXCLUDE_FROM_LIBMLIR ) + add_mlir_library(mlir_rocmrt_runtime + SHARED + RocmRuntimeWrappers.cpp + + EXCLUDE_FROM_LIBMLIR + ) # Supress compiler warnings from HIP headers check_cxx_compiler_flag(-Wno-c++98-compat-extra-semi @@ -260,32 +291,53 @@ if (CXX_SUPPORTS_CXX98_COMPAT_EXTRA_SEMI_FLAG) target_compile_options(mlir_rocm_runtime PRIVATE "-Wno-c++98-compat-extra-semi") + target_compile_options(mlir_rocmrt_runtime PRIVATE + "-Wno-c++98-compat-extra-semi") endif() check_cxx_compiler_flag(-Wno-return-type-c-linkage CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG) if (CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG) target_compile_options(mlir_rocm_runtime PRIVATE "-Wno-return-type-c-linkage") + target_compile_options(mlir_rocmrt_runtime PRIVATE + "-Wno-return-type-c-linkage") endif() check_cxx_compiler_flag(-Wno-nested-anon-types CXX_SUPPORTS_WNO_NESTED_ANON_TYPES_FLAG) if (CXX_SUPPORTS_WNO_NESTED_ANON_TYPES_FLAG) target_compile_options(mlir_rocm_runtime PRIVATE "-Wno-nested-anon-types") + target_compile_options(mlir_rocmrt_runtime PRIVATE + "-Wno-nested-anon-types") endif() check_cxx_compiler_flag(-Wno-gnu-anonymous-struct CXX_SUPPORTS_WNO_GNU_ANONYMOUS_STRUCT_FLAG) if (CXX_SUPPORTS_WNO_GNU_ANONYMOUS_STRUCT_FLAG) target_compile_options(mlir_rocm_runtime PRIVATE "-Wno-gnu-anonymous-struct") + target_compile_options(mlir_rocmrt_runtime PRIVATE + "-Wno-gnu-anonymous-struct") endif() set_property(TARGET mlir_rocm_runtime PROPERTY INSTALL_RPATH_USE_LINK_PATH ON) + set_property(TARGET mlir_rocmrt_runtime + PROPERTY INSTALL_RPATH_USE_LINK_PATH ON) + target_link_libraries(mlir_rocm_runtime PUBLIC hip::host hip::amdhip64 ) + + target_link_libraries(mlir_rocmrt_runtime + PUBLIC + hip::host hip::amdhip64 + ) + + target_compile_definitions(mlir_rocmrt_runtime + PRIVATE + MLIR_USE_HIPRT_RUNNER=1 + ) endif() endif() diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -16,15 +16,43 @@ #include +// Most of these functions should be interoperable: +// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DRIVER.html +// However to avoid any pitfalls in context management, the runtime api is used +// when available. 
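The comment above relies on the driver and runtime APIs being interoperable, which is also what lets the cudart build keep a few driver-only calls (see mgpuMemset32 further down). A minimal sketch of that assumption, not part of the patch, which presumes a CUDA toolkit is installed and the program links against both cudart and the driver library:

// Memory allocated through the runtime API is handed to a driver API call,
// relying on the primary context that the first runtime call sets up.
#include <cstdio>
#include <cuda.h>
#include <cuda_runtime.h>

int main() {
  void *ptr = nullptr;
  if (cudaMalloc(&ptr, 256 * sizeof(unsigned)) != cudaSuccess) {
    std::fprintf(stderr, "cudaMalloc failed\n");
    return 1;
  }
  CUresult result = cuMemsetD32(reinterpret_cast<CUdeviceptr>(ptr), 0, 256);
  std::fprintf(stderr, "cuMemsetD32 returned %d\n", static_cast<int>(result));
  cudaFree(ptr);
  return 0;
}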
+ +#ifdef MLIR_USE_CUDART_RUNNER +#include "cuda_runtime.h" +#endif #include "cuda.h" +#ifdef MLIR_USE_CUDART_RUNNER +#define kernel_t void * +#define stream_t cudaStream_t +#define event_t cudaEvent_t +#else +#define kernel_t CUfunction +#define stream_t CUstream +#define event_t CUevent +#endif + #ifdef _WIN32 #define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport) #else #define MLIR_CUDA_WRAPPERS_EXPORT #endif // _WIN32 +#ifdef MLIR_USE_CUDART_RUNNER #define CUDA_REPORT_IF_ERROR(expr) \ + [](cudaError_t result) { \ + if (!result) \ + return; \ + const char *name = cudaGetErrorName(result); \ + if (!name) \ + name = ""; \ + fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ + }(expr) +#define CUDA_DRIVER_REPORT_IF_ERROR(expr) \ [](CUresult result) { \ if (!result) \ return; \ @@ -35,6 +63,19 @@ fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ }(expr) +#else +#define CUDA_REPORT_IF_ERROR(expr) \ + [](CUresult result) { \ + if (!result) \ + return; \ + const char *name = nullptr; \ + cuGetErrorName(result, &name); \ + if (!name) \ + name = ""; \ + fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ + }(expr) +#define CUDA_DRIVER_REPORT_IF_ERROR(expr) CUDA_REPORT_IF_ERROR(expr) + thread_local static int32_t defaultDevice = 0; // Make the primary context of the current default device current for the @@ -60,7 +101,9 @@ ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); } }; +#endif +#ifndef MLIR_USE_CUDART_RUNNER extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) { ScopedContext scopedContext; CUmodule module = nullptr; @@ -78,84 +121,145 @@ CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name)); return function; } +#endif // The wrapper uses intptr_t instead of CUDA's unsigned int to match // the type of MLIR's index type. This avoids the need for casts in the // generated MLIR code. 
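Both CUDA_REPORT_IF_ERROR variants above wrap the checked expression in an immediately invoked lambda so that the report stays a single statement and the expression text is captured via #expr. A dependency-free sketch of the same pattern follows; Status, statusName, and doWork are stand-ins, not CUDA APIs.

#include <cstdio>

enum Status { Success = 0, Failure = 1 };
static const char *statusName(Status s) { return s == Success ? "ok" : "failure"; }

// Same shape as the macros above: evaluate the expression once, report only
// on failure, and include the textual form of the expression in the message.
#define REPORT_IF_ERROR(expr)                                                  \
  [](Status result) {                                                          \
    if (!result)                                                               \
      return;                                                                  \
    std::fprintf(stderr, "'%s' failed with '%s'\n", #expr,                     \
                 statusName(result));                                          \
  }(expr)

static Status doWork(bool ok) { return ok ? Success : Failure; }

int main() {
  REPORT_IF_ERROR(doWork(true));  // prints nothing
  REPORT_IF_ERROR(doWork(false)); // prints: 'doWork(false)' failed with 'failure'
  return 0;
}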
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY, +mgpuLaunchKernel(kernel_t function, intptr_t gridX, intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, - intptr_t blockZ, int32_t smem, CUstream stream, void **params, + intptr_t blockZ, int32_t smem, stream_t stream, void **params, void **extra) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaLaunchKernel(function, dim3(gridX, gridY, gridZ), + dim3(blockX, blockY, blockZ), params, + smem, stream)); +#else ScopedContext scopedContext; CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, stream, params, extra)); +#endif } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT stream_t mgpuStreamCreate() { +#ifdef MLIR_USE_CUDART_RUNNER + cudaStream_t stream; + CUDA_REPORT_IF_ERROR(cudaStreamCreate(&stream)); + return stream; +#else ScopedContext scopedContext; - CUstream stream = nullptr; + stream_t stream = nullptr; CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); return stream; +#endif } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamDestroy(CUstream stream) { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamDestroy(stream_t stream) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaStreamDestroy(stream)); +#else CUDA_REPORT_IF_ERROR(cuStreamDestroy(stream)); +#endif } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuStreamSynchronize(CUstream stream) { +mgpuStreamSynchronize(stream_t stream) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaStreamSynchronize(stream)); +#else CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream)); +#endif } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamWaitEvent(CUstream stream, - CUevent event) { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamWaitEvent(stream_t stream, + event_t event) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaStreamWaitEvent(stream, event, /*flags=*/0)); +#else CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream, event, /*flags=*/0)); +#endif } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUevent mgpuEventCreate() { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT event_t mgpuEventCreate() { +#ifdef MLIR_USE_CUDART_RUNNER + cudaEvent_t event; + CUDA_REPORT_IF_ERROR(cudaEventCreate(&event)); + return event; +#else ScopedContext scopedContext; - CUevent event = nullptr; + event_t event = nullptr; CUDA_REPORT_IF_ERROR(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); return event; +#endif } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventDestroy(CUevent event) { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventDestroy(event_t event) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaEventDestroy(event)); +#else CUDA_REPORT_IF_ERROR(cuEventDestroy(event)); +#endif } -extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventSynchronize(CUevent event) { +extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventSynchronize(event_t event) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaEventSynchronize(event)); +#else CUDA_REPORT_IF_ERROR(cuEventSynchronize(event)); +#endif } -extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(CUevent event, - CUstream stream) { +extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(event_t event, + stream_t stream) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaEventRecord(event, stream)); +#else CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream)); +#endif } -extern "C" void 
*mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) { +extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, stream_t /*stream*/) { +#ifdef MLIR_USE_CUDART_RUNNER + void *ptr; + CUDA_REPORT_IF_ERROR(cudaMalloc(&ptr, sizeBytes)); + return ptr; +#else ScopedContext scopedContext; CUdeviceptr ptr; CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes)); return reinterpret_cast(ptr); +#endif } -extern "C" void mgpuMemFree(void *ptr, CUstream /*stream*/) { +extern "C" void mgpuMemFree(void *ptr, stream_t /*stream*/) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaFree(ptr)); +#else CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast(ptr))); +#endif } extern "C" void mgpuMemcpy(void *dst, void *src, size_t sizeBytes, - CUstream stream) { + stream_t stream) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR( + cudaMemcpyAsync(dst, src, sizeBytes, cudaMemcpyDefault, stream)); +#else CUDA_REPORT_IF_ERROR(cuMemcpyAsync(reinterpret_cast(dst), reinterpret_cast(src), sizeBytes, stream)); +#endif } extern "C" void mgpuMemset32(void *dst, unsigned int value, size_t count, - CUstream stream) { - CUDA_REPORT_IF_ERROR(cuMemsetD32Async(reinterpret_cast(dst), - value, count, stream)); + stream_t stream) { + // There's no cuda runtime equivalent for this specific function, but it + // should be interoperable: + // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DRIVER.html + CUDA_DRIVER_REPORT_IF_ERROR(cuMemsetD32Async( + reinterpret_cast(dst), value, count, stream)); } /// Helper functions for writing mlir example code @@ -164,8 +268,12 @@ // transfer functions implemented. extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaHostRegister(ptr, sizeBytes, /*flags=*/0)); +#else ScopedContext scopedContext; CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0)); +#endif } /// Registers a memref with the CUDA runtime. `descriptor` is a pointer to a @@ -194,8 +302,12 @@ // Allows to unregister byte array with the CUDA runtime. extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemHostUnregister(void *ptr) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaHostUnregister(ptr)); +#else ScopedContext scopedContext; CUDA_REPORT_IF_ERROR(cuMemHostUnregister(ptr)); +#endif } /// Unregisters a memref with the CUDA runtime. `descriptor` is a pointer to a @@ -209,5 +321,9 @@ } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaSetDevice(device)); +#else defaultDevice = device; +#endif } diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp --- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp @@ -52,15 +52,27 @@ // The wrapper uses intptr_t instead of ROCM's unsigned int to match // the type of MLIR's index type. This avoids the need for casts in the // generated MLIR code. 
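From the lowered host code's perspective the wrappers above are plain C symbols, and every handle is passed as an opaque pointer (the GPU-to-LLVM lowering uses !llvm.ptr throughout). The following host-side usage sketch assumes it is linked against one of the wrapper libraries built above; void * stands in for CUstream / cudaStream_t / hipStream_t at the ABI level and the example is illustrative only.

#include <cstdint>
#include <cstring>

// Prototypes of the wrapper ABI exercised by the sketch below.
extern "C" void *mgpuStreamCreate();
extern "C" void mgpuStreamSynchronize(void *stream);
extern "C" void mgpuStreamDestroy(void *stream);
extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, void *stream);
extern "C" void mgpuMemFree(void *ptr, void *stream);
extern "C" void mgpuMemcpy(void *dst, void *src, size_t sizeBytes, void *stream);

int main() {
  float host[128];
  std::memset(host, 0, sizeof(host));

  void *stream = mgpuStreamCreate();
  void *device = mgpuMemAlloc(sizeof(host), stream);
  mgpuMemcpy(device, host, sizeof(host), stream); // host -> device
  mgpuMemcpy(host, device, sizeof(host), stream); // device -> host
  mgpuStreamSynchronize(stream);
  mgpuMemFree(device, stream);
  mgpuStreamDestroy(stream);
  return 0;
}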
-extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX, + +#ifdef MLIR_USE_HIPRT_RUNNER +#define kernel_t void * +#else +#define kernel_t hipFunction_t +#endif +extern "C" void mgpuLaunchKernel(kernel_t function, intptr_t gridX, intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem, hipStream_t stream, void **params, void **extra) { +#ifdef MLIR_USE_HIPRT_RUNNER + HIP_REPORT_IF_ERROR(hipLaunchKernel(function, dim3(gridX, gridY, gridZ), + dim3(blockX, blockY, blockZ), params, + smem, stream)); +#else HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, stream, params, extra)); +#endif } extern "C" hipStream_t mgpuStreamCreate() { diff --git a/mlir/test/Conversion/GPUCommon/offload.mlir b/mlir/test/Conversion/GPUCommon/offload.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/GPUCommon/offload.mlir @@ -0,0 +1,129 @@ +// RUN: mlir-opt --gpu-to-offload --split-input-file -verify-diagnostics %s | FileCheck %s + +// Perform a comprehensive test of the offload annotations. +module attributes {gpu.container_module} { + // CHECK: llvm.mlir.global internal unnamed_addr constant @".omp_offloading.entry_name.[[KERNEL_ID:.*]]"("[[KERNEL_NAME:.*]]\00") {addr_space = 0 : i32} + // CHECK-NEXT: llvm.mlir.global weak constant @".omp_offloading.entry.[[KERNEL_ID]]"() + // CHECK-SAME: {addr_space = 0 : i32, alignment = 1 : i64, section = "hip_offloading_entries"} : !llvm.struct<[[STRUCT_BODY:.*]]> { + // CHECK-NEXT: %[[V0:.*]] = llvm.mlir.undef : !llvm.struct<[[STRUCT_BODY]]> + // CHECK-NEXT: %[[V1:.*]] = llvm.mlir.addressof @[[KERNEL_ID]]_stub : !llvm.ptr + // CHECK-NEXT: %[[V2:.*]] = llvm.insertvalue %[[V1]], %[[V0]][0] : !llvm.struct<[[STRUCT_BODY]]> + // CHECK-NEXT: %[[V3:.*]] = llvm.mlir.addressof @".omp_offloading.entry_name.[[KERNEL_ID]]" : !llvm.ptr + // CHECK-NEXT: %[[V4:.*]] = llvm.insertvalue %[[V3]], %[[V2]][1] : !llvm.struct<[[STRUCT_BODY]]> + // CHECK-NEXT: %[[V5:.*]] = llvm.mlir.constant(0 : i64) : i64 + // CHECK-NEXT: %[[V6:.*]] = llvm.insertvalue %[[V5]], %[[V4]][2] : !llvm.struct<[[STRUCT_BODY]]> + // CHECK-NEXT: %[[V7:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-NEXT: %[[V8:.*]] = llvm.insertvalue %[[V7]], %[[V6]][3] : !llvm.struct<[[STRUCT_BODY]]> + // CHECK-NEXT: %[[V9:.*]] = llvm.insertvalue %[[V7]], %[[V8]][4] : !llvm.struct<[[STRUCT_BODY]]> + // CHECK-NEXT: llvm.return %[[V9]] : !llvm.struct<[[STRUCT_BODY]]> + // CHECK-NEXT: } + // CHECK: llvm.func @[[KERNEL_ID]]_stub() attributes {dso_local} { + // CHECK-NEXT: llvm.return + // CHECK-NEXT: } + // CHECK: llvm.mlir.global private constant @[[OBJECT_LABEL:.*]]("\10\FF\10\AD") {addr_space = 0 : i32, alignment = 8 : i64, section = ".llvm.offloading"} + // CHECK-NEXT: llvm.mlir.global appending @llvm.compiler.used() {addr_space = 0 : i32, section = "llvm.metadata"} : !llvm.array<1 x ptr> { + // CHECK-NEXT: %[[V0:.*]] = llvm.mlir.undef : !llvm.array<1 x ptr> + // CHECK-NEXT: %[[V1:.*]] = llvm.mlir.addressof @[[OBJECT_LABEL]] : !llvm.ptr + // CHECK-NEXT: %[[COMPILER_USED_GLOBAL:.*]] = llvm.insertvalue %[[V1]], %[[V0]][0] : !llvm.array<1 x ptr> + // CHECK-NEXT: llvm.return %[[COMPILER_USED_GLOBAL]] : !llvm.array<1 x ptr> + // CHECK-NEXT: } + // CHECK: llvm.func @host_function() { + // CHECK: %[[KERNEL:.*]] = llvm.mlir.addressof @[[KERNEL_ID]]_stub : !llvm.ptr + // CHECK: llvm.call @mgpuLaunchKernel(%[[KERNEL]], {{.*}}) : (!llvm.ptr, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> () + // 
CHECK-NEXT: llvm.call @mgpuStreamSynchronize({{.*}}) : (!llvm.ptr) -> () + // CHECK-NEXT: llvm.call @mgpuStreamDestroy({{.*}}) : (!llvm.ptr) -> () + // CHECK-NEXT: llvm.return + // CHECK-NEXT: } + // CHECK: llvm.func @mgpuStreamCreate() -> !llvm.ptr + // CHECK: llvm.func @mgpuLaunchKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) + // CHECK: llvm.func @mgpuStreamSynchronize(!llvm.ptr) + // CHECK: llvm.func @mgpuStreamDestroy(!llvm.ptr) + func.func @host_function() { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + gpu.launch_func @device_module::@llvm_kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%c0: i32) + return + } + gpu.module @device_module attributes {llvm.offload.kind = "hip", llvm.offload.object = "\10\FF\10\AD"} { + llvm.func @llvm_kernel(%arg0: i32) attributes {gpu.kernel} { + llvm.return + } + } +} + +// ----- + +// Test multiple modules. +module attributes {gpu.container_module} { + // CHECK: llvm.mlir.global internal unnamed_addr constant @".omp_offloading.entry_name.[[CUDA_KERNEL_ID:.*]]"("kernel\00") {addr_space = 0 : i32} + // CHECK-NEXT: llvm.mlir.global weak constant @".omp_offloading.entry.[[CUDA_KERNEL_ID]]"() + // CHECK-SAME: {addr_space = 0 : i32, alignment = 1 : i64, section = "cuda_offloading_entries"} : !llvm.struct<[[STRUCT_BODY:.*]]> { + // CHECK: llvm.func @[[CUDA_KERNEL_ID]]_stub() attributes {dso_local} { + // CHECK: llvm.mlir.global internal unnamed_addr constant @".omp_offloading.entry_name.[[HIP_KERNEL_ID:.*]]"("kernel\00") {addr_space = 0 : i32} + // CHECK-NEXT: llvm.mlir.global weak constant @".omp_offloading.entry.[[HIP_KERNEL_ID]]"() + // CHECK-SAME: {addr_space = 0 : i32, alignment = 1 : i64, section = "hip_offloading_entries"} : !llvm.struct<[[STRUCT_BODY:.*]]> { + // CHECK: llvm.func @[[HIP_KERNEL_ID]]_stub() attributes {dso_local} { + // CHECK: llvm.mlir.global private constant @[[OBJECT_LABEL:.*]]("HIP_BLOBCUDA_BLOB") {addr_space = 0 : i32, alignment = 8 : i64, section = ".llvm.offloading"} + func.func @host_function() { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + // CHECK: %[[HIP_KERNEL:.*]] = llvm.mlir.addressof @[[HIP_KERNEL_ID]]_stub : !llvm.ptr + // CHECK: llvm.call @mgpuLaunchKernel(%[[HIP_KERNEL]], {{.*}}) : (!llvm.ptr, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> () + gpu.launch_func @hip_module::@kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%c0: i32) + // CHECK: %[[CUDA_KERNEL:.*]] = llvm.mlir.addressof @[[CUDA_KERNEL_ID]]_stub : !llvm.ptr + // CHECK: llvm.call @mgpuLaunchKernel(%[[CUDA_KERNEL]], {{.*}}) : (!llvm.ptr, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> () + gpu.launch_func @cuda_module::@kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%c0: i32) + return + } + gpu.module @hip_module attributes {llvm.offload.kind = "hip", llvm.offload.object = "HIP_BLOB"} { + llvm.func @kernel(%arg0: i32) attributes {gpu.kernel} { + llvm.return + } + } + gpu.module @cuda_module attributes {llvm.offload.kind = "cuda", llvm.offload.object = "CUDA_BLOB"} { + llvm.func @kernel(%arg0: i32) attributes {gpu.kernel} { + llvm.return + } + } +} + +// ----- + +// Test an invalid module with no offload object attribute. 
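The weak globals checked at the top of this test file mirror the offload entry record consumed by clang's new offload driver: a pointer to the host stub, a pointer to the entry name, a size, and two 32-bit fields, placed in a cuda_offloading_entries or hip_offloading_entries section. A C++ sketch of that layout is given for orientation; the struct and field names follow the clang offloading documentation referenced earlier and are not defined by this patch.

#include <cstdint>

// One offloading entry; insertvalue indices [0]..[4] in the CHECK lines above
// correspond to these five fields.
struct OffloadEntry {
  void *addr;       // [0] address of the host-side kernel stub
  char *name;       // [1] address of the null-terminated kernel name
  uint64_t size;    // [2] 0 for kernels
  int32_t flags;    // [3] flags, 0 in the tests above
  int32_t reserved; // [4] reserved, 0 in the tests above
};

int main() { return sizeof(OffloadEntry) ? 0 : 1; }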
+module attributes {gpu.container_module} { + func.func @host_function() { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + gpu.launch_func @hip_module::@kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%c0: i32) + return + } + // expected-error@+1 {{the gpu.module doesn't contain an offload object}} + gpu.module @hip_module attributes {llvm.offload.kind = "hip"} { + llvm.func @kernel(%arg0: i32) attributes {gpu.kernel} { + llvm.return + } + } +} + +// ----- + +// Test an invalid module with no offload kind attribute. +module attributes {gpu.container_module} { + func.func @host_function() { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + // expected-error@+1 {{failed to legalize operation 'gpu.launch_func'}} + gpu.launch_func @hip_module::@kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%c0: i32) + return + } + // expected-error@+1 {{the module doesn't contain a valid offloading kind}} + gpu.module @hip_module attributes {llvm.offload.object = "HIP_BLOB"} { + llvm.func @kernel(%arg0: i32) attributes {gpu.kernel} { + llvm.return + } + } +} \ No newline at end of file diff --git a/mlir/test/Dialect/GPU/mangle-names.mlir b/mlir/test/Dialect/GPU/mangle-names.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/GPU/mangle-names.mlir @@ -0,0 +1,96 @@ +// RUN: mlir-opt --gpu-name-mangling --split-input-file -verify-diagnostics %s | FileCheck %s + +// Verify that only the symbols *defined* inside the gpu.module are mangled, +// and that all the symbol references are updated accordingly. +module attributes {gpu.container_module} { + // CHECK-LABEL: func.func @bar + func.func @bar() { + return + } + + // CHECK-LABEL: func.func @host_function + func.func @host_function() { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + // CHECK: call @bar + func.call @bar(): () -> () + // CHECK: gpu.launch_func @device_module::@__Gdevice_module_Skernel + gpu.launch_func @device_module::@kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args() + // CHECK: gpu.launch_func @device_module::@__Gdevice_module_Sllvm_kernel + gpu.launch_func @device_module::@llvm_kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%c0: i32) + return + } + + // CHECK-LABEL: gpu.module @device_module + gpu.module @device_module { + // CHECK-LABEL: func.func private @foo + func.func private @foo() + // CHECK-LABEL: func.func @__Gdevice_module_Sbar + func.func @bar() { + return + } + // CHECK-LABEL: gpu.func @__Gdevice_module_Skernel + gpu.func @kernel() kernel attributes {gpu.known_block_size = array} { + // CHECK: call @__Gdevice_module_Sbar + func.call @bar(): () -> () + gpu.return + } + // CHECK-LABEL: llvm.func @__Gdevice_module_Sllvm_kernel + llvm.func @llvm_kernel(%arg0: i32) attributes {gpu.kernel} { + llvm.return + } + } +} + +// ----- + +// Test name mangling with multiple modules. 
+module attributes {gpu.container_module} { + // CHECK-LABEL: func.func @host + func.func @host() { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + // CHECK: gpu.launch_func @device_module_1::@__Gdevice_module_1_Skernel + gpu.launch_func @device_module_1::@kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args() + // CHECK: gpu.launch_func @device_module_1::@__Gdevice_module_1_Skernel_bar + gpu.launch_func @device_module_1::@kernel_bar blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args() + // CHECK: gpu.launch_func @device_module_2::@__Gdevice_module_2_Skernel + gpu.launch_func @device_module_2::@kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args() + return + } + + // CHECK-LABEL: gpu.module @device_module_1 + gpu.module @device_module_1 { + // CHECK-LABEL: func.func @__Gdevice_module_1_Sbar + func.func @bar() { + return + } + // CHECK-LABEL: gpu.func @__Gdevice_module_1_Skernel + gpu.func @kernel() kernel attributes {gpu.known_block_size = array} { + // CHECK: call @__Gdevice_module_1_Sbar + func.call @bar(): () -> () + gpu.return + } + // CHECK-LABEL: gpu.func @__Gdevice_module_1_Skernel_bar + gpu.func @kernel_bar() kernel attributes {gpu.known_block_size = array} { + gpu.return + } + } + + // CHECK-LABEL: gpu.module @device_module_2 + gpu.module @device_module_2 { + // CHECK-LABEL: func.func @__Gdevice_module_2_Sbar + func.func @bar() { + return + } + // CHECK-LABEL: gpu.func @__Gdevice_module_2_Skernel + gpu.func @kernel() kernel attributes {gpu.known_block_size = array} { + // CHECK: call @__Gdevice_module_2_Sbar + func.call @bar(): () -> () + gpu.return + } + } +} + diff --git a/mlir/test/Dialect/GPU/serialize-to-amdgpu.mlir b/mlir/test/Dialect/GPU/serialize-to-amdgpu.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/GPU/serialize-to-amdgpu.mlir @@ -0,0 +1,64 @@ +// RUN: mlir-opt --gpu-to-amdgpu --split-input-file -verify-diagnostics %s | FileCheck %s + +// CHECK-LABEL: gpu.module @kernel +// CHECK-SAME: llvm.offload.kind = "hip" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @kernel { + llvm.func @kernel() attributes {gpu.kernel, rocdl.kernel, rocdl.reqd_work_group_size = array} { + llvm.return + } +} + +// ----- + +// Test an empty module +// CHECK-LABEL: gpu.module @empty_module +// CHECK-SAME: llvm.offload.kind = "hip" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @empty_module { +} + +// ----- + +// Test multiple modules +// CHECK-LABEL: gpu.module @module_1 +// CHECK-SAME: llvm.offload.kind = "hip" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @module_1 { + llvm.func @kernel() attributes {gpu.kernel, rocdl.kernel, rocdl.reqd_work_group_size = array} { + llvm.return + } +} + +// CHECK-LABEL: gpu.module @module_2 +// CHECK-SAME: llvm.offload.kind = "hip" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @module_2 { + llvm.func @bar(i32) -> f64 + llvm.func @kernel() attributes {gpu.kernel, rocdl.kernel, rocdl.reqd_work_group_size = array} { + llvm.return + } +} + +// CHECK-LABEL: gpu.module @module_3 +// CHECK-SAME: llvm.offload.kind = "hip" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @module_3 { + llvm.func @kernel_1() attributes {gpu.kernel, rocdl.kernel, rocdl.reqd_work_group_size = array} { + llvm.return + } + llvm.func @kernel_2() attributes {gpu.kernel, rocdl.kernel, rocdl.reqd_work_group_size = array} { + llvm.return + } +} + +// ----- + +// Test a kernel with an invalid instruction. 
+gpu.module @kernel_invalid { + llvm.func @kernel() attributes {gpu.kernel, rocdl.kernel, rocdl.reqd_work_group_size = array} { + // expected-error@+1 {{cannot be converted to LLVM IR: missing `LLVMTranslationDialectInterface`}} + %0 = nvvm.read.ptx.sreg.ctaid.x : i32 + llvm.return + } +} diff --git a/mlir/test/Dialect/GPU/serialize-to-nvptx.mlir b/mlir/test/Dialect/GPU/serialize-to-nvptx.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/GPU/serialize-to-nvptx.mlir @@ -0,0 +1,64 @@ +// RUN: mlir-opt --gpu-to-nvptx --split-input-file -verify-diagnostics %s | FileCheck %s + +// CHECK-LABEL: gpu.module @kernel +// CHECK-SAME: llvm.offload.kind = "cuda" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @kernel { + llvm.func @kernel() attributes {gpu.kernel, gpu.known_block_size = array, nvvm.kernel} { + llvm.return + } +} + +// ----- + +// Test an empty module +// CHECK-LABEL: gpu.module @empty_module +// CHECK-SAME: llvm.offload.kind = "cuda" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @empty_module { +} + +// ----- + +// Test multiple modules +// CHECK-LABEL: gpu.module @module_1 +// CHECK-SAME: llvm.offload.kind = "cuda" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @module_1 { + llvm.func @kernel() attributes {gpu.kernel, gpu.known_block_size = array, nvvm.kernel} { + llvm.return + } +} + +// CHECK-LABEL: gpu.module @module_2 +// CHECK-SAME: llvm.offload.kind = "cuda" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @module_2 { + llvm.func @bar(i32) -> f64 + llvm.func @kernel() attributes {gpu.kernel, gpu.known_block_size = array, nvvm.kernel} { + llvm.return + } +} + +// CHECK-LABEL: gpu.module @module_3 +// CHECK-SAME: llvm.offload.kind = "cuda" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @module_3 { + llvm.func @kernel_1() attributes {gpu.kernel, gpu.known_block_size = array, nvvm.kernel} { + llvm.return + } + llvm.func @kernel_2() attributes {gpu.kernel, gpu.known_block_size = array, nvvm.kernel} { + llvm.return + } +} + +// ----- + +// Test a kernel with an invalid instruction. +gpu.module @kernel_invalid { + llvm.func @kernel() attributes {gpu.kernel, gpu.known_block_size = array, nvvm.kernel} { + // expected-error@+1 {{cannot be converted to LLVM IR: missing `LLVMTranslationDialectInterface`}} + %0 = rocdl.workgroup.id.x : i32 + llvm.return + } +} \ No newline at end of file
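For reference, the mgpuLaunchKernel calls checked in offload.mlir above correspond to the following host-side prototype exported by the updated runtime wrappers; the kernel handle and the stream are opaque pointers from the caller's side. This is a declaration-only C++ sketch restating the ABI, not new functionality.

#include <cstdint>

// Matches the checked call type:
// (!llvm.ptr, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr)
extern "C" void mgpuLaunchKernel(
    void *kernel,                                      // kernel stub, CUfunction, or hipFunction_t
    intptr_t gridX, intptr_t gridY, intptr_t gridZ,    // grid dimensions
    intptr_t blockX, intptr_t blockY, intptr_t blockZ, // block dimensions
    int32_t smem,                                      // dynamic shared memory, in bytes
    void *stream,                                      // stream handle
    void **params,                                     // pointers to kernel arguments
    void **extra);                                     // driver-API extra options, may be null

int main() { return 0; }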