diff --git a/mlir/lib/Target/LLVMIR/CMakeLists.txt b/mlir/lib/Target/LLVMIR/CMakeLists.txt --- a/mlir/lib/Target/LLVMIR/CMakeLists.txt +++ b/mlir/lib/Target/LLVMIR/CMakeLists.txt @@ -57,6 +57,8 @@ MLIROpenACCToLLVMIRTranslation MLIROpenMPToLLVMIRTranslation MLIRROCDLToLLVMIRTranslation + MLIRNVVMTarget + MLIRROCDLTarget ) add_mlir_translation_library(MLIRTargetLLVMIRImport diff --git a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp --- a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp +++ b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp @@ -13,6 +13,8 @@ #include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/BuiltinOps.h" +#include "mlir/Target/LLVM/NVVM/Target.h" +#include "mlir/Target/LLVM/ROCDL/Target.h" #include "mlir/Target/LLVMIR/Dialect/All.h" #include "mlir/Target/LLVMIR/Export.h" #include "mlir/Tools/mlir-translate/Translation.h" @@ -36,6 +38,8 @@ }, [](DialectRegistry &registry) { registry.insert<DLTIDialect, func::FuncDialect>(); + registerNVVMTarget(registry); + registerROCDLTarget(registry); registerAllToLLVMIRTranslations(registry); }); } diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp --- a/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp @@ -12,10 +12,28 @@ #include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Target/LLVMIR/LLVMTranslationInterface.h" +#include "llvm/ADT/TypeSwitch.h" using namespace mlir; namespace { +LogicalResult launchKernel(gpu::LaunchFuncOp launchOp, + llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + auto kernelBinary = SymbolTable::lookupNearestSymbolFrom<gpu::BinaryOp>( + launchOp, launchOp.getKernelModuleName()); + if (!kernelBinary) { + launchOp.emitError("Couldn't find the binary holding the kernel: ") + << 
launchOp.getKernelModuleName(); + return failure(); + } + auto offloadingHandler = + dyn_cast<gpu::OffloadingLLVMTranslationAttrInterface>( + kernelBinary.getOffloadingHandlerAttr()); + assert(offloadingHandler && "Invalid offloading handler."); + return offloadingHandler.launchKernel(launchOp, kernelBinary, builder, + moduleTranslation); +} class GPUDialectLLVMIRTranslationInterface : public LLVMTranslationDialectInterface { @@ -23,9 +41,23 @@ using LLVMTranslationDialectInterface::LLVMTranslationDialectInterface; LogicalResult - convertOperation(Operation *op, llvm::IRBuilderBase &builder, + convertOperation(Operation *operation, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) const override { - return isa<gpu::GPUModuleOp>(op) ? success() : failure(); + return llvm::TypeSwitch<Operation *, LogicalResult>(operation) + .Case([&](gpu::GPUModuleOp) { return success(); }) + .Case([&](gpu::BinaryOp op) { + auto offloadingHandler = + dyn_cast<gpu::OffloadingLLVMTranslationAttrInterface>( + op.getOffloadingHandlerAttr()); + assert(offloadingHandler && "Invalid offloading handler."); + return offloadingHandler.embedBinary(op, builder, moduleTranslation); + }) + .Case([&](gpu::LaunchFuncOp op) { + return launchKernel(op, builder, moduleTranslation); + }) + .Default([&](Operation *op) { + return op->emitError("unsupported GPU operation: ") << op->getName(); + }); } }; diff --git a/mlir/test/Target/LLVMIR/gpu.mlir b/mlir/test/Target/LLVMIR/gpu.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Target/LLVMIR/gpu.mlir @@ -0,0 +1,77 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +// Checking the translation of the `gpu.binary` & `gpu.launch_func` ops. 
+module attributes {gpu.container_module} { + // CHECK: [[ARGS_TY:%.*]] = type { i32, i32 } + // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8 + // CHECK: @kernel_module_kernel_kernel_name = private unnamed_addr constant [7 x i8] c"kernel\00", align 1 + gpu.binary @kernel_module [#gpu.object<#nvvm.target, "BLOB">] + llvm.func @foo() { + // CHECK: [[ARGS:%.*]] = alloca %{{.*}}, align 8 + // CHECK: [[ARGS_ARRAY:%.*]] = alloca ptr, i64 2, align 8 + // CHECK: [[ARG0:%.*]] = getelementptr inbounds [[ARGS_TY]], ptr [[ARGS]], i32 0, i32 0 + // CHECK: store i32 32, ptr [[ARG0]], align 4 + // CHECK: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 0 + // CHECK: store ptr [[ARG0]], ptr %{{.*}}, align 8 + // CHECK: [[ARG1:%.*]] = getelementptr inbounds [[ARGS_TY]], ptr [[ARGS]], i32 0, i32 1 + // CHECK: store i32 32, ptr [[ARG1]], align 4 + // CHECK: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 1 + // CHECK: store ptr [[ARG1]], ptr %{{.*}}, align 8 + // CHECK: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst) + // CHECK: [[FUNC:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[MODULE]], ptr @kernel_module_kernel_kernel_name) + // CHECK: [[STREAM:%.*]] = call ptr @mgpuStreamCreate() + // CHECK: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 256, ptr [[STREAM]], ptr [[ARGS_ARRAY]], ptr null) + // CHECK: call void @mgpuStreamSynchronize(ptr [[STREAM]]) + // CHECK: call void @mgpuStreamDestroy(ptr [[STREAM]]) + // CHECK: call void @mgpuModuleUnload(ptr [[MODULE]]) + %0 = llvm.mlir.constant(8 : index) : i64 + %1 = llvm.mlir.constant(32 : i32) : i32 + %2 = llvm.mlir.constant(256 : i32) : i32 + gpu.launch_func @kernel_module::@kernel blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %2 args(%1 : i32, %1 : i32) + llvm.return + } +} + +// ----- + +// Checking the correct selection of the second object using an index as a selector. 
+module { + // CHECK: @kernel_module_bin_cst = internal constant [1 x i8] c"1", align 8 + gpu.binary @kernel_module <#gpu.select_object<1>> [#gpu.object<#nvvm.target, "0">, #gpu.object<#nvvm.target, "1">] +} + +// ----- + +// Checking the correct selection of the second object using a target as a selector. +module { + // CHECK: @kernel_module_bin_cst = internal constant [6 x i8] c"AMDGPU", align 8 + gpu.binary @kernel_module <#gpu.select_object<#rocdl.target>> [#gpu.object<#nvvm.target, "NVPTX">, #gpu.object<#rocdl.target, "AMDGPU">] +} + +// ----- + +// Checking the translation of `gpu.launch_func` with an async dependency. +module attributes {gpu.container_module} { + // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8 + gpu.binary @kernel_module [#gpu.object<#rocdl.target, "BLOB">] + llvm.func @foo() { + %0 = llvm.mlir.constant(8 : index) : i64 + // CHECK: = call ptr @mgpuStreamCreate() + // CHECK-NEXT: = alloca {{.*}}, align 8 + // CHECK-NEXT: [[ARGS:%.*]] = alloca ptr, i64 0, align 8 + // CHECK-NEXT: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst) + // CHECK-NEXT: [[FUNC:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[MODULE]], ptr @kernel_module_kernel_kernel_name) + // CHECK-NEXT: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 0, ptr {{.*}}, ptr [[ARGS]], ptr null) + // CHECK-NEXT: call void @mgpuModuleUnload(ptr [[MODULE]]) + // CHECK-NEXT: call void @mgpuStreamSynchronize(ptr %{{.*}}) + // CHECK-NEXT: call void @mgpuStreamDestroy(ptr %{{.*}}) + %1 = llvm.call @mgpuStreamCreate() : () -> !llvm.ptr + gpu.launch_func <%1 : !llvm.ptr> @kernel_module::@kernel blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 + llvm.call @mgpuStreamSynchronize(%1) : (!llvm.ptr) -> () + llvm.call @mgpuStreamDestroy(%1) : (!llvm.ptr) -> () + llvm.return + } + llvm.func @mgpuStreamCreate() -> !llvm.ptr + llvm.func @mgpuStreamSynchronize(!llvm.ptr) + llvm.func @mgpuStreamDestroy(!llvm.ptr) +}