diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -41,6 +41,7 @@
 add_definitions(-DMLIR_ROCM_CONVERSIONS_ENABLED=${MLIR_ROCM_CONVERSIONS_ENABLED})
 
 set(MLIR_CUDA_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir CUDA runner")
+set(MLIR_ROCM_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir ROCm runner")
 set(MLIR_VULKAN_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir Vulkan runner")
 
 option(MLIR_INCLUDE_TESTS
diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -43,7 +43,8 @@
 /// instead uses a small wrapper library that exports a stable and conveniently
 /// typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP).
 std::unique_ptr<OperationPass<ModuleOp>>
-createConvertGpuLaunchFuncToGpuRuntimeCallsPass();
+createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
+    StringRef gpuBinaryAnnotation = "");
 
 /// Creates a pass to convert kernel functions into GPU target object blobs.
 ///
diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
--- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
+++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
@@ -123,6 +123,11 @@
   void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
 
 public:
+  GpuLaunchFuncToGpuRuntimeCallsPass() = default;
+  GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) {
+    this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
+  }
+
   // Run the dialect converter on the module.
   void runOnOperation() override {
     // Cache the LLVMDialect for the current module.
@@ -457,6 +462,8 @@
 }
 
 std::unique_ptr<OperationPass<ModuleOp>>
-mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass() {
-  return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>();
+mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
+    StringRef gpuBinaryAnnotation) {
+  return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>(
+      gpuBinaryAnnotation);
 }
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -13,8 +13,9 @@
 set(MLIR_RUNNER_UTILS_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
 
 # Passed to lit.site.cfg.py.in to set up the path where to find the libraries
-# for the mlir cuda runner tests.
+# for the mlir cuda / rocm / vulkan runner tests.
 set(MLIR_CUDA_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
+set(MLIR_ROCM_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
 set(MLIR_VULKAN_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
 
 configure_lit_site_cfg(
@@ -64,6 +65,12 @@
   )
 endif()
 
+if(MLIR_ROCM_RUNNER_ENABLED)
+  list(APPEND MLIR_TEST_DEPENDS
+    mlir-rocm-runner
+  )
+endif()
+
 if(MLIR_VULKAN_RUNNER_ENABLED)
   list(APPEND MLIR_TEST_DEPENDS
     mlir-vulkan-runner
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -70,6 +70,7 @@
     ToolSubst('%cuda_wrapper_library_dir', config.cuda_wrapper_library_dir, unresolved='ignore'),
     ToolSubst('%linalg_test_lib_dir', config.linalg_test_lib_dir, unresolved='ignore'),
     ToolSubst('%mlir_runner_utils_dir', config.mlir_runner_utils_dir, unresolved='ignore'),
+    ToolSubst('%rocm_wrapper_library_dir', config.rocm_wrapper_library_dir, unresolved='ignore'),
     ToolSubst('%vulkan_wrapper_library_dir', config.vulkan_wrapper_library_dir, unresolved='ignore')
 ])
 
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -39,6 +39,8 @@
 config.cuda_wrapper_library_dir = "@MLIR_CUDA_WRAPPER_LIBRARY_DIR@"
 config.enable_cuda_runner = @MLIR_CUDA_RUNNER_ENABLED@
 config.run_rocm_tests = @MLIR_ROCM_CONVERSIONS_ENABLED@
+config.rocm_wrapper_library_dir = "@MLIR_ROCM_WRAPPER_LIBRARY_DIR@"
+config.enable_rocm_runner = @MLIR_ROCM_RUNNER_ENABLED@
 config.vulkan_wrapper_library_dir = "@MLIR_VULKAN_WRAPPER_LIBRARY_DIR@"
 config.enable_vulkan_runner = @MLIR_VULKAN_RUNNER_ENABLED@
diff --git a/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir b/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir
@@ -0,0 +1,32 @@
+// RUN: mlir-rocm-runner %s --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+
+func @other_func(%arg0 : f32, %arg1 : memref<?xf32>) {
+  %cst = constant 1 : index
+  %cst2 = dim %arg1, 0 : memref<?xf32>
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, %grid_z = %cst)
+             threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst) {
+    store %arg0, %arg1[%tx] : memref<?xf32>
+    gpu.terminator
+  }
+  return
+}
+
+// CHECK: [1, 1, 1, 1, 1]
+func @main() {
+  %arg0 = alloc() : memref<5xf32>
+  %21 = constant 5 : i32
+  %22 = memref_cast %arg0 : memref<5xf32> to memref<?xf32>
+  %cast = memref_cast %22 : memref<?xf32> to memref<*xf32>
+  call @mgpuMemHostRegisterFloat(%cast) : (memref<*xf32>) -> ()
+  %23 = memref_cast %22 : memref<?xf32> to memref<*xf32>
+  call @print_memref_f32(%23) : (memref<*xf32>) -> ()
+  %24 = constant 1.0 : f32
+  %25 = call @mgpuMemGetDeviceMemRef1dFloat(%22) : (memref<?xf32>) -> (memref<?xf32>)
+  call @other_func(%24, %25) : (f32, memref<?xf32>) -> ()
+  call @print_memref_f32(%23) : (memref<*xf32>) -> ()
+  return
+}
+
+func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>)
+func @mgpuMemGetDeviceMemRef1dFloat(%ptr : memref<?xf32>) -> (memref<?xf32>)
+func @print_memref_f32(%ptr : memref<*xf32>)
diff --git a/mlir/test/mlir-rocm-runner/lit.local.cfg b/mlir/test/mlir-rocm-runner/lit.local.cfg
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-rocm-runner/lit.local.cfg
@@ -0,0 +1,2 @@
+if not config.enable_rocm_runner:
+  config.unsupported = True
diff --git a/mlir/test/mlir-rocm-runner/two-modules.mlir b/mlir/test/mlir-rocm-runner/two-modules.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-rocm-runner/two-modules.mlir
@@ -0,0 +1,30 @@
+// RUN: mlir-rocm-runner %s --print-ir-after-all --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s --dump-input=always
+
+// CHECK: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+func @main() {
+  %arg = alloc() : memref<13xi32>
+  %dst = memref_cast %arg : memref<13xi32> to memref<?xi32>
+  %one = constant 1 : index
+  %sx = dim %dst, 0 : memref<?xi32>
+  %cast_dst = memref_cast %dst : memref<?xi32> to memref<*xi32>
+  call @mgpuMemHostRegisterInt32(%cast_dst) : (memref<*xi32>) -> ()
+  %dst_device = call @mgpuMemGetDeviceMemRef1dInt32(%dst) : (memref<?xi32>) -> (memref<?xi32>)
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
+             threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
+    %t0 = index_cast %tx : index to i32
+    store %t0, %dst_device[%tx] : memref<?xi32>
+    gpu.terminator
+  }
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
+             threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
+    %t0 = index_cast %tx : index to i32
+    store %t0, %dst_device[%tx] : memref<?xi32>
+    gpu.terminator
+  }
+  call @print_memref_i32(%cast_dst) : (memref<*xi32>) -> ()
+  return
+}
+
+func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>)
+func @mgpuMemGetDeviceMemRef1dInt32(%ptr : memref<?xi32>) -> (memref<?xi32>)
+func @print_memref_i32(%ptr : memref<*xi32>)
diff --git a/mlir/test/mlir-rocm-runner/vecadd.mlir b/mlir/test/mlir-rocm-runner/vecadd.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-rocm-runner/vecadd.mlir
@@ -0,0 +1,42 @@
+// RUN: mlir-rocm-runner %s --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+
+func @vecadd(%arg0 : memref<?xf32>, %arg1 : memref<?xf32>, %arg2 : memref<?xf32>) {
+  %cst = constant 1 : index
+  %cst2 = dim %arg0, 0 : memref<?xf32>
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, %grid_z = %cst)
+             threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst) {
+    %a = load %arg0[%tx] : memref<?xf32>
+    %b = load %arg1[%tx] : memref<?xf32>
+    %c = addf %a, %b : f32
+    store %c, %arg2[%tx] : memref<?xf32>
+    gpu.terminator
+  }
+  return
+}
+
+// CHECK: [2.46, 2.46, 2.46, 2.46, 2.46]
+func @main() {
+  %0 = alloc() : memref<5xf32>
+  %1 = alloc() : memref<5xf32>
+  %2 = alloc() : memref<5xf32>
+  %3 = memref_cast %0 : memref<5xf32> to memref<?xf32>
+  %4 = memref_cast %1 : memref<5xf32> to memref<?xf32>
+  %5 = memref_cast %2 : memref<5xf32> to memref<?xf32>
+  %6 = memref_cast %3 : memref<?xf32> to memref<*xf32>
+  %7 = memref_cast %4 : memref<?xf32> to memref<*xf32>
+  %8 = memref_cast %5 : memref<?xf32> to memref<*xf32>
+  call @mgpuMemHostRegisterFloat(%6) : (memref<*xf32>) -> ()
+  call @mgpuMemHostRegisterFloat(%7) : (memref<*xf32>) -> ()
+  call @mgpuMemHostRegisterFloat(%8) : (memref<*xf32>) -> ()
+  %9 = call @mgpuMemGetDeviceMemRef1dFloat(%3) : (memref<?xf32>) -> (memref<?xf32>)
+  %10 = call @mgpuMemGetDeviceMemRef1dFloat(%4) : (memref<?xf32>) -> (memref<?xf32>)
+  %11 = call @mgpuMemGetDeviceMemRef1dFloat(%5) : (memref<?xf32>) -> (memref<?xf32>)
+
+  call @vecadd(%9, %10, %11) : (memref<?xf32>, memref<?xf32>, memref<?xf32>) -> ()
+  call @print_memref_f32(%8) : (memref<*xf32>) -> ()
+  return
+}
+
+func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>)
+func @mgpuMemGetDeviceMemRef1dFloat(%ptr : memref<?xf32>) -> (memref<?xf32>)
+func @print_memref_f32(%ptr : memref<*xf32>)
diff --git a/mlir/tools/CMakeLists.txt b/mlir/tools/CMakeLists.txt
--- a/mlir/tools/CMakeLists.txt
+++ b/mlir/tools/CMakeLists.txt
@@ -2,6 +2,7 @@
 add_subdirectory(mlir-cpu-runner)
 add_subdirectory(mlir-linalg-ods-gen)
 add_subdirectory(mlir-opt)
+add_subdirectory(mlir-rocm-runner)
 add_subdirectory(mlir-translate)
 add_subdirectory(mlir-vulkan-runner)
 add_subdirectory(mlir-shlib)
diff --git a/mlir/tools/mlir-rocm-runner/CMakeLists.txt b/mlir/tools/mlir-rocm-runner/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/mlir/tools/mlir-rocm-runner/CMakeLists.txt
@@ -0,0 +1,113 @@
+set(LLVM_OPTIONAL_SOURCES
+  rocm-runtime-wrappers.cpp
+  mlir-rocm-runner.cpp
+  )
+
+if(MLIR_ROCM_RUNNER_ENABLED)
+  if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD))
+    message(SEND_ERROR
+      "Building the mlir rocm runner requires the AMDGPU backend")
+  endif()
+
+  # Ensure lld is enabled.
+  if (NOT "lld" IN_LIST LLVM_ENABLE_PROJECTS)
+    message(SEND_ERROR "lld is not enabled. Please revise LLVM_ENABLE_PROJECTS")
+  endif()
+
+  # lld header files.
+  include_directories(${MLIR_SOURCE_DIR}/../lld/include)
+
+  # Configure ROCm support.
+  if (NOT DEFINED ROCM_PATH)
+    if (NOT DEFINED ENV{ROCM_PATH})
+      set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
+    else()
+      set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
+    endif()
+    set(HIP_PATH "${ROCM_PATH}/hip" CACHE PATH "Path to which HIP has been installed")
+  endif()
+  set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
+  find_package(HIP)
+  if (NOT HIP_FOUND)
+    message(SEND_ERROR "Building the mlir rocm runner requires a working ROCm and HIP install")
+  else()
+    message(STATUS "ROCm HIP version: ${HIP_VERSION}")
+  endif()
+
+  # Locate HIP runtime library.
+  find_library(ROCM_RUNTIME_LIBRARY hip_hcc
+    PATHS "${HIP_PATH}/lib")
+  if (NOT ROCM_RUNTIME_LIBRARY)
+    message(SEND_ERROR "Could not locate ROCm HIP runtime library")
+  else()
+    message(STATUS "ROCm HIP runtime lib: ${ROCM_RUNTIME_LIBRARY}")
+  endif()
+
+  # Set HIP compile-time flags.
+  add_definitions(-D__HIP_PLATFORM_HCC__)
+
+  add_llvm_library(rocm-runtime-wrappers SHARED
+    rocm-runtime-wrappers.cpp
+  )
+  target_include_directories(rocm-runtime-wrappers
+    PRIVATE
+    "${HIP_PATH}/../include"
+    "${HIP_PATH}/include"
+    LLVMSupport
+  )
+  target_link_libraries(rocm-runtime-wrappers
+    PUBLIC
+    LLVMSupport
+    ${ROCM_RUNTIME_LIBRARY}
+  )
+
+  get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
+  get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
+  set(LIBS
+    ${dialect_libs}
+    ${conversion_libs}
+    lldCommon
+    lldDriver
+    lldELF
+    LLVMCore
+    LLVMLTO
+    LLVMMC
+    LLVMMCParser
+    LLVMOption
+    LLVMSupport
+    MLIRJitRunner
+    MLIRAnalysis
+    MLIREDSC
+    MLIRExecutionEngine
+    MLIRIR
+    MLIRParser
+    MLIRROCDLIR
+    MLIRSupport
+    MLIRTargetLLVMIR
+    MLIRTargetROCDLIR
+    MLIRTransforms
+    MLIRTranslation
+    ${ROCM_RUNTIME_LIBRARY}
+  )
+
+  # Manually expand the target library, since our MLIR libraries
+  # aren't plugged into the LLVM dependency tracking. If we don't
+  # do this then we can't insert the CodeGen library after ourselves.
+  llvm_expand_pseudo_components(TARGET_LIBS AllTargetsCodeGens AllTargetsAsmParsers)
+  # Prepend LLVM in front of every target, this is how the libraries
+  # are named with CMake.
+  SET(targets_to_link)
+  FOREACH(t ${TARGET_LIBS})
+    LIST(APPEND targets_to_link "LLVM${t}")
+  ENDFOREACH(t)
+
+  add_llvm_tool(mlir-rocm-runner
+    mlir-rocm-runner.cpp
+
+    DEPENDS
+    rocm-runtime-wrappers
+  )
+  llvm_update_compile_flags(mlir-rocm-runner)
+  target_link_libraries(mlir-rocm-runner PRIVATE ${LIBS} ${targets_to_link})
+
+endif()
diff --git a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
@@ -0,0 +1,248 @@
+//===- mlir-rocm-runner.cpp - MLIR ROCM Execution Driver ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a command line utility that executes an MLIR file on the GPU by
+// translating MLIR to ROCDL/LLVM IR before JIT-compiling and executing the
+// latter.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
+#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
+#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/ExecutionEngine/JitRunner.h"
+#include "mlir/ExecutionEngine/OptUtils.h"
+#include "mlir/IR/Function.h"
+#include "mlir/IR/Module.h"
+#include "mlir/InitAllDialects.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Support/FileUtilities.h"
+#include "mlir/Target/ROCDLIR.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/Passes.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+
+// MC headers.
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCParser/AsmLexer.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetOptionsCommandFlags.h"
+
+// lld headers.
+#include "lld/Common/Driver.h"
+
+using namespace mlir;
+using namespace llvm;
+
+using Blob = SmallVector<char, 0>;
+
+static cl::opt<std::string> tripleName("triple", cl::desc("target triple"),
+                                       cl::value_desc("triple string"),
+                                       cl::init("amdgcn-amd-amdhsa"));
+
+// TODO(whchung): Add a feature to automatically detect the available AMD GCN
+// ISA version via the `rocm-agent-enumerator` utility.
+static cl::opt<std::string> targetChip("target", cl::desc("target chip"),
+                                       cl::value_desc("AMDGPU ISA version"),
+                                       cl::init("gfx900"));
+
+static cl::opt<std::string> features("feature", cl::desc("target features"),
+                                     cl::value_desc("AMDGPU target features"),
+                                     cl::init("-code-object-v3"));
+
+static LogicalResult assembleIsa(const std::string isa, StringRef name,
+                                 Blob &result) {
+  raw_svector_ostream os(result);
+
+  std::string error;
+  Triple theTriple(Triple::normalize(tripleName));
+  const Target *theTarget =
+      TargetRegistry::lookupTarget(theTriple.normalize(), error);
+  if (!theTarget) {
+    WithColor::error(errs(), name) << error;
+    return failure();
+  }
+
+  SourceMgr srcMgr;
+  srcMgr.AddNewSourceBuffer(MemoryBuffer::getMemBuffer(isa), SMLoc());
+
+  const MCTargetOptions mcOptions;
+  std::unique_ptr<MCRegisterInfo> mri(theTarget->createMCRegInfo(tripleName));
+  std::unique_ptr<MCAsmInfo> mai(
+      theTarget->createMCAsmInfo(*mri, tripleName, mcOptions));
+  mai->setRelaxELFRelocations(true);
+
+  MCObjectFileInfo mofi;
+  MCContext ctx(mai.get(), mri.get(), &mofi, &srcMgr, &mcOptions);
+  mofi.InitMCObjectFileInfo(theTriple, false, ctx, false);
+
+  SmallString<128> cwd;
+  if (!sys::fs::current_path(cwd))
+    ctx.setCompilationDir(cwd);
+
+  std::unique_ptr<MCStreamer> mcStreamer;
+  std::unique_ptr<MCInstrInfo> mcii(theTarget->createMCInstrInfo());
+  std::unique_ptr<MCSubtargetInfo> sti(
+      theTarget->createMCSubtargetInfo(tripleName, targetChip, features));
+
+  MCCodeEmitter *ce = theTarget->createMCCodeEmitter(*mcii, *mri, ctx);
+  MCAsmBackend *mab = theTarget->createMCAsmBackend(*sti, *mri, mcOptions);
+  mcStreamer.reset(theTarget->createMCObjectStreamer(
+      theTriple, ctx, std::unique_ptr<MCAsmBackend>(mab),
+      mab->createObjectWriter(os), std::unique_ptr<MCCodeEmitter>(ce), *sti,
+      mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible,
+      /*DWARFMustBeAtTheEnd*/ false));
+  mcStreamer->setUseAssemblerInfoForParsing(true);
+
+  std::unique_ptr<MCAsmParser> parser(
+      createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai));
+  std::unique_ptr<MCTargetAsmParser> tap(
+      theTarget->createMCAsmParser(*sti, *parser, *mcii, mcOptions));
+
+  if (!tap) {
+    WithColor::error(errs(), name) << "assembler initialization error.\n";
+    return failure();
+  }
+
+  parser->setTargetParser(*tap);
+  parser->Run(false);
+
+  return success();
+}
+
+static LogicalResult createHsaco(const Blob &isaBlob, StringRef name,
+                                 Blob &hsacoBlob) {
+  // Save the ISA binary to a temp file.
+  int tempIsaBinaryFd = -1;
+  SmallString<128> tempIsaBinaryFilename;
+  std::error_code ec = sys::fs::createTemporaryFile(
+      "kernel", "o", tempIsaBinaryFd, tempIsaBinaryFilename);
+  if (ec) {
+    WithColor::error(errs(), name)
+        << "temporary file for ISA binary creation error.\n";
+    return failure();
+  }
+  FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
+  raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true);
+  tempIsaBinaryOs << isaBlob;
+  tempIsaBinaryOs.close();
+
+  // Create a temp file for HSA code object.
+  int tempHsacoFD = -1;
+  SmallString<128> tempHsacoFilename;
+  ec = sys::fs::createTemporaryFile("kernel", "hsaco", tempHsacoFD,
+                                    tempHsacoFilename);
+  if (ec) {
+    WithColor::error(errs(), name)
+        << "temporary file for HSA code object creation error.\n";
+    return failure();
+  }
+  FileRemover cleanupHsaco(tempHsacoFilename);
+
+  // Invoke lld. Expect a true return value from lld.
+  bool ret = lld::elf::link({"ld.lld", "-shared", tempIsaBinaryFilename.c_str(),
+                             "-o", tempHsacoFilename.c_str()},
+                            /*canEarlyExit=*/false, llvm::outs(), llvm::errs());
+  if (!ret) {
+    WithColor::error(errs(), name) << "lld invocation error.\n";
+    return failure();
+  }
+
+  // Load the HSA code object.
+  auto hsacoFile = mlir::openInputFile(tempHsacoFilename);
+  if (!hsacoFile) {
+    WithColor::error(errs(), name)
+        << "read HSA code object from temp file error.\n";
+    return failure();
+  }
+  hsacoBlob.assign(hsacoFile->getBuffer().begin(),
+                   hsacoFile->getBuffer().end());
+
+  return success();
+}
+
+static std::unique_ptr<llvm::Module> compileModuleToROCDLIR(Operation *m) {
+  auto llvmModule = translateModuleToROCDLIR(m);
+  // TODO(whchung): Link with ROCm-Device-Libs if needed (e.g. when the Module
+  // depends on math functions).
+  return llvmModule;
+}
+
+static OwnedBlob compileISAToHsaco(const std::string isa, Location loc,
+                                   StringRef name) {
+  // ISA -> ISA in binary form via MC.
+  // Use lld to create HSA code object.
+  Blob isaBlob;
+  Blob hsacoBlob;
+
+  if (succeeded(assembleIsa(isa, name, isaBlob)) &&
+      succeeded(createHsaco(isaBlob, name, hsacoBlob)))
+    return std::make_unique<std::vector<char>>(hsacoBlob.begin(),
+                                               hsacoBlob.end());
+
+  WithColor::error(errs(), name) << "producing HSA code object error.\n";
+  return {};
+}
+
+static LogicalResult runMLIRPasses(ModuleOp m) {
+  PassManager pm(m.getContext());
+  applyPassManagerCLOptions(pm);
+
+  pm.addPass(createGpuKernelOutliningPass());
+  auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
+  kernelPm.addPass(createStripDebugInfoPass());
+  kernelPm.addPass(createLowerGpuOpsToROCDLOpsPass());
+  kernelPm.addPass(createConvertGPUKernelToBlobPass(
+      compileModuleToROCDLIR, compileISAToHsaco, tripleName, targetChip,
+      features, /*gpuBinaryAnnotation=*/"rocdl.hsaco"));
+  pm.addPass(createLowerToLLVMPass());
+  pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
+      /*gpuBinaryAnnotation=*/"rocdl.hsaco"));
+
+  return pm.run(m);
+}
+
+int main(int argc, char **argv) {
+  registerPassManagerCLOptions();
+  mlir::registerAllDialects();
+  llvm::InitLLVM y(argc, argv);
+  llvm::InitializeAllTargetInfos();
+  llvm::InitializeAllTargetMCs();
+  llvm::InitializeAllAsmParsers();
+
+  // Initialize LLVM AMDGPU backend.
+  LLVMInitializeAMDGPUTarget();
+  LLVMInitializeAMDGPUTargetInfo();
+  LLVMInitializeAMDGPUTargetMC();
+  LLVMInitializeAMDGPUAsmPrinter();
+
+  mlir::initializeLLVMPasses();
+  return mlir::JitRunnerMain(argc, argv, &runMLIRPasses);
+}
diff --git a/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp b/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp
@@ -0,0 +1,143 @@
+//===- rocm-runtime-wrappers.cpp - MLIR ROCM runner wrapper library -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements C wrappers around the ROCM library for easy linking in ORC JIT.
+// Also adds some debugging helpers that are helpful when writing MLIR code to
+// run on GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <numeric>
+
+#include "mlir/ExecutionEngine/CRunnerUtils.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "hip/hip_runtime.h"
+
+namespace {
+int32_t reportErrorIfAny(hipError_t result, const char *where) {
+  if (result != hipSuccess) {
+    llvm::errs() << "HIP failed with " << result << " in " << where << "\n";
+  }
+  return result;
+}
+} // anonymous namespace
+
+extern "C" int32_t mgpuModuleLoad(void **module, void *data) {
+  int32_t err = reportErrorIfAny(
+      hipModuleLoadData(reinterpret_cast<hipModule_t *>(module), data),
+      "ModuleLoad");
+  return err;
+}
+
+extern "C" int32_t mgpuModuleGetFunction(void **function, void *module,
+                                         const char *name) {
+  return reportErrorIfAny(
+      hipModuleGetFunction(reinterpret_cast<hipFunction_t *>(function),
+                           reinterpret_cast<hipModule_t>(module), name),
+      "GetFunction");
+}
+
+// The wrapper uses intptr_t instead of ROCM's unsigned int to match
+// the type of MLIR's index type. This avoids the need for casts in the
+// generated MLIR code.
+extern "C" int32_t mgpuLaunchKernel(void *function, intptr_t gridX,
+                                    intptr_t gridY, intptr_t gridZ,
+                                    intptr_t blockX, intptr_t blockY,
+                                    intptr_t blockZ, int32_t smem, void *stream,
+                                    void **params, void **extra) {
+  return reportErrorIfAny(
+      hipModuleLaunchKernel(reinterpret_cast<hipFunction_t>(function), gridX,
+                            gridY, gridZ, blockX, blockY, blockZ, smem,
+                            reinterpret_cast<hipStream_t>(stream), params,
+                            extra),
+      "LaunchKernel");
+}
+
+extern "C" void *mgpuGetStreamHelper() {
+  hipStream_t stream;
+  reportErrorIfAny(hipStreamCreate(&stream), "StreamCreate");
+  return stream;
+}
+
+extern "C" int32_t mgpuStreamSynchronize(void *stream) {
+  return reportErrorIfAny(
+      hipStreamSynchronize(reinterpret_cast<hipStream_t>(stream)),
+      "StreamSync");
+}
+
+/// Helper functions for writing mlir example code.
+
+// Allows registering a byte array with the ROCM runtime. Helpful until we have
+// transfer functions implemented.
+extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
+  reportErrorIfAny(hipHostRegister(ptr, sizeBytes, /*flags=*/0),
+                   "MemHostRegister");
+}
+
+// Allows registering a MemRef with the ROCM runtime. Initializes the array
+// with value. Helpful until we have transfer functions implemented.
+template <typename T>
+void mgpuMemHostRegisterMemRef(T *pointer, llvm::ArrayRef<int64_t> sizes,
+                               llvm::ArrayRef<int64_t> strides, T value) {
+  assert(sizes.size() == strides.size());
+  llvm::SmallVector<int64_t, 4> denseStrides(strides.size());
+
+  std::partial_sum(sizes.rbegin(), sizes.rend(), denseStrides.rbegin(),
+                   std::multiplies<int64_t>());
+  auto count = denseStrides.front();
+
+  // Only densely packed tensors are currently supported.
+  std::rotate(denseStrides.begin(), denseStrides.begin() + 1,
+              denseStrides.end());
+  denseStrides.back() = 1;
+  assert(strides == llvm::makeArrayRef(denseStrides));
+
+  std::fill_n(pointer, count, value);
+  mgpuMemHostRegister(pointer, count * sizeof(T));
+}
+
+extern "C" void mgpuMemHostRegisterFloat(int64_t rank, void *ptr) {
+  auto *desc = static_cast<StridedMemRefType<float, 1> *>(ptr);
+  auto sizes = llvm::ArrayRef<int64_t>(desc->sizes, rank);
+  auto strides = llvm::ArrayRef<int64_t>(desc->sizes + rank, rank);
+  mgpuMemHostRegisterMemRef(desc->data + desc->offset, sizes, strides, 1.23f);
+}
+
+extern "C" void mgpuMemHostRegisterInt32(int64_t rank, void *ptr) {
+  auto *desc = static_cast<StridedMemRefType<int32_t, 1> *>(ptr);
+  auto sizes = llvm::ArrayRef<int64_t>(desc->sizes, rank);
+  auto strides = llvm::ArrayRef<int64_t>(desc->sizes + rank, rank);
+  mgpuMemHostRegisterMemRef(desc->data + desc->offset, sizes, strides, 123);
+}
+
+template <typename T>
+void mgpuMemGetDevicePointer(T *hostPtr, T **devicePtr) {
+  reportErrorIfAny(hipSetDevice(0), "hipSetDevice");
+  reportErrorIfAny(
+      hipHostGetDevicePointer((void **)devicePtr, hostPtr, /*flags=*/0),
+      "hipHostGetDevicePointer");
+}
+
+extern "C" StridedMemRefType<float, 1>
+mgpuMemGetDeviceMemRef1dFloat(float *allocated, float *aligned, int64_t offset,
+                              int64_t size, int64_t stride) {
+  float *devicePtr = nullptr;
+  mgpuMemGetDevicePointer(aligned, &devicePtr);
+  return {devicePtr, devicePtr, offset, {size}, {stride}};
+}
+
+extern "C" StridedMemRefType<int32_t, 1>
+mgpuMemGetDeviceMemRef1dInt32(int32_t *allocated, int32_t *aligned,
+                              int64_t offset, int64_t size, int64_t stride) {
+  int32_t *devicePtr = nullptr;
+  mgpuMemGetDevicePointer(aligned, &devicePtr);
+  return {devicePtr, devicePtr, offset, {size}, {stride}};
+}
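
For reference, below is a minimal host-side sketch of how the rocm-runtime-wrappers ABI introduced above is meant to be exercised; the MLIR lowered by mlir-rocm-runner emits equivalent calls. This example is not part of the patch: it assumes the rocm-runtime-wrappers library and a working HIP install are linked in, and it only uses the wrapper signatures defined in rocm-runtime-wrappers.cpp plus StridedMemRefType from CRunnerUtils.h.

// Hypothetical standalone example (not part of the patch).
#include <cstdint>
#include <vector>

#include "mlir/ExecutionEngine/CRunnerUtils.h"

// Declarations matching the extern "C" definitions in rocm-runtime-wrappers.cpp.
extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes);
extern "C" StridedMemRefType<float, 1>
mgpuMemGetDeviceMemRef1dFloat(float *allocated, float *aligned, int64_t offset,
                              int64_t size, int64_t stride);
extern "C" void *mgpuGetStreamHelper();
extern "C" int32_t mgpuStreamSynchronize(void *stream);

int main() {
  std::vector<float> host(5, 1.23f);
  // Pin the host buffer so the device can access it (no explicit copies yet).
  mgpuMemHostRegister(host.data(), host.size() * sizeof(float));
  // Build a device-visible 1-D memref descriptor over the same buffer.
  StridedMemRefType<float, 1> device = mgpuMemGetDeviceMemRef1dFloat(
      host.data(), host.data(), /*offset=*/0, /*size=*/5, /*stride=*/1);
  // A kernel launched through mgpuLaunchKernel would write via device.data;
  // synchronizing the stream then makes the results visible on the host.
  void *stream = mgpuGetStreamHelper();
  mgpuStreamSynchronize(stream);
  return device.data == nullptr ? 1 : 0;
}

The gpu-to-hsaco.mlir test above issues the same registration and device-memref calls from MLIR before launching the kernel.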