diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Passes.h
@@ -90,6 +90,10 @@
 /// annotation.
 void registerGpuSerializeToCubinPass();
 
+/// Register pass to serialize GPU kernel functions to a HSAco binary
+/// annotation.
+void registerGpuSerializeToHsacoPass();
+
 /// Generate the code for registering passes.
 #define GEN_PASS_REGISTRATION
 #include "mlir/Dialect/GPU/Passes.h.inc"
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -52,6 +52,7 @@
   registerAsyncPasses();
   registerGPUPasses();
   registerGpuSerializeToCubinPass();
+  registerGpuSerializeToHsacoPass();
   registerLinalgPasses();
   LLVM::registerLLVMPasses();
   quant::registerQuantPasses();
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -6,6 +6,17 @@
   )
 endif()
 
+if (MLIR_ROCM_CONVERSIONS_ENABLED)
+  set(AMDGPU_LIBS
+    MCParser
+    AMDGPUAsmParser
+    AMDGPUAsmPrinter
+    AMDGPUCodeGen
+    AMDGPUDesc
+    AMDGPUInfo
+  )
+endif()
+
 add_mlir_dialect_library(MLIRGPU
   IR/GPUDialect.cpp
   Transforms/AllReduceLowering.cpp
@@ -15,6 +26,7 @@
   Transforms/ParallelLoopMapper.cpp
   Transforms/SerializeToBlob.cpp
   Transforms/SerializeToCubin.cpp
+  Transforms/SerializeToHsaco.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
@@ -23,6 +35,7 @@
   Core
   MC
   ${NVPTX_LIBS}
+  ${AMDGPU_LIBS}
 
   DEPENDS
   MLIRGPUOpsIncGen
@@ -83,3 +96,74 @@
   )
 
 endif()
+
+if(MLIR_ROCM_RUNNER_ENABLED)
+  if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD))
+    message(SEND_ERROR
+      "Building the mlir rocm runner requires the AMDGPU backend")
+  endif()
+
+  # Ensure lld is enabled.
+  if (NOT "lld" IN_LIST LLVM_ENABLE_PROJECTS)
+    message(SEND_ERROR "lld is not enabled. Please revise LLVM_ENABLE_PROJECTS")
+  endif()
+
+  # lld header files.
+  include_directories(${MLIR_SOURCE_DIR}/../lld/include)
+
+  # Configure ROCm support; HIP_PATH defaults even when ROCM_PATH is user-set.
+  if (NOT DEFINED ROCM_PATH)
+    if (NOT DEFINED ENV{ROCM_PATH})
+      set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
+    else()
+      set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
+    endif()
+  endif()
+  set(HIP_PATH "${ROCM_PATH}/hip" CACHE PATH "Path to which HIP has been installed")
+  set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
+  find_package(HIP)
+  if (NOT HIP_FOUND)
+    message(SEND_ERROR "Build the mlir rocm runner requires a working ROCm and HIP install")
+  else()
+    message(STATUS "ROCm HIP version: ${HIP_VERSION}")
+  endif()
+
+  # Set compile-time flags for ROCm path.
+  add_definitions(-D__ROCM_PATH__="${ROCM_PATH}")
+
+  # Locate HIP runtime library.
+  find_library(ROCM_RUNTIME_LIBRARY amdhip64
+               PATHS "${HIP_PATH}/lib")
+  if (NOT ROCM_RUNTIME_LIBRARY)
+    message(SEND_ERROR "Could not locate ROCm HIP runtime library")
+  else()
+    message(STATUS "ROCm HIP runtime lib: ${ROCM_RUNTIME_LIBRARY}")
+  endif()
+
+  target_compile_definitions(obj.MLIRGPU
+    PRIVATE
+    # Set HIP compile-time flags.
+    __HIP_PLATFORM_HCC__
+    # Enable gpu-to-hsaco pass.
+    MLIR_GPU_TO_HSACO_PASS_ENABLE=1
+  )
+
+  # Add ROCm headers includes.
+  target_include_directories(obj.MLIRGPU
+    PRIVATE
+    "${ROCM_PATH}/include"
+    "${HIP_PATH}/include"
+  )
+
+  target_link_libraries(MLIRGPU
+    PRIVATE
+    lldCommon
+    lldDriver
+    lldELF
+    MLIRROCDLToLLVMIRTranslation
+    ${ROCM_RUNTIME_LIBRARY}
+  )
+
+  llvm_update_compile_flags(obj.MLIRGPU)
+
+endif()
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
@@ -0,0 +1,284 @@
+//===- SerializeToHsaco.cpp - Convert GPU kernel to HSACO blob ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass that serializes a gpu module into HSAco blob and
+// adds that blob as a string attribute of the module.
+//
+//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/GPU/Passes.h"
+
+#if MLIR_GPU_TO_HSACO_PASS_ENABLE
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/FileUtilities.h"
+#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Export.h"
+
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Target/TargetOptions.h"
+
+#include "lld/Common/Driver.h"
+
+#include "hip/hip_version.h"
+
+#include <mutex>
+
+using namespace mlir;
+
+namespace {
+class SerializeToHsacoPass
+    : public PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass> {
+public:
+  SerializeToHsacoPass();
+
+private:
+  void getDependentDialects(DialectRegistry &registry) const override;
+
+  // Serializes the ISA text to a HSACO blob: assembleIsa, then createHsaco.
+  std::unique_ptr<std::vector<char>>
+  serializeISA(const std::string &isa) override;
+
+  std::unique_ptr<SmallVectorImpl<char>> assembleIsa(const std::string &isa);
+  std::unique_ptr<std::vector<char>>
+  createHsaco(const SmallVectorImpl<char> &isaBinary);
+};
+} // namespace
+
+// Returns the chip reported by rocm_agent_enumerator, or "gfx900" as fallback.
+static std::string getDefaultChip() {
+  const char kDefaultChip[] = "gfx900";
+
+  // Locate rocm_agent_enumerator.
+  const char kRocmAgentEnumerator[] = "rocm_agent_enumerator";
+  llvm::ErrorOr<std::string> rocmAgentEnumerator = llvm::sys::findProgramByName(
+      kRocmAgentEnumerator, {__ROCM_PATH__ "/bin"});
+  if (!rocmAgentEnumerator) {
+    llvm::WithColor::warning(llvm::errs())
+        << kRocmAgentEnumerator << " couldn't be located under "
+        << __ROCM_PATH__ << "/bin\n";
+    return kDefaultChip;
+  }
+
+  // Prepare temp file to hold the outputs (by name only, no open descriptor).
+  SmallString<128> tempFilename;
+  if (llvm::sys::fs::createTemporaryFile("rocm_agent", "txt", tempFilename)) {
+    llvm::WithColor::warning(llvm::errs())
+        << "temporary file for " << kRocmAgentEnumerator << " creation error\n";
+    return kDefaultChip;
+  }
+  llvm::FileRemover cleanup(tempFilename);
+
+  // Invoke rocm_agent_enumerator, capturing its stdout in the temp file.
+  // NOTE(review): args[0] is passed as argv[0]; confirm "-t" is not lost.
+  std::string errorMessage;
+  SmallVector<StringRef, 2> args{"-t", "GPU"};
+  Optional<StringRef> redirects[3] = {{""}, tempFilename.str(), {""}};
+  int result =
+      llvm::sys::ExecuteAndWait(rocmAgentEnumerator.get(), args, llvm::None,
+                                redirects, 0, 0, &errorMessage);
+  if (result) {
+    llvm::WithColor::warning(llvm::errs())
+        << kRocmAgentEnumerator << " invocation error: " << errorMessage
+        << "\n";
+    return kDefaultChip;
+  }
+
+  // Load and parse the result.
+  auto gfxIsaList = openInputFile(tempFilename);
+  if (!gfxIsaList) {
+    llvm::WithColor::error(llvm::errs())
+        << "read ROCm agent list temp file error\n";
+    return kDefaultChip;
+  }
+  for (llvm::line_iterator lines(*gfxIsaList); !lines.is_at_end(); ++lines) {
+    // Skip the line with content "gfx000".
+    if (*lines == "gfx000")
+      continue;
+    // Use the first ISA version found.
+    return lines->str();
+  }
+
+  return kDefaultChip;
+}
+
+// Sets 'option' to the result of 'getValue()' unless it already has a value.
+static void maybeSetOption(Pass::Option<std::string> &option,
+                           function_ref<std::string()> getValue) {
+  if (!option.hasValue())
+    option = getValue();
+}
+
+SerializeToHsacoPass::SerializeToHsacoPass() {
+  maybeSetOption(this->triple, [] { return "amdgcn-amd-amdhsa"; });
+  maybeSetOption(this->chip, []() -> std::string {
+    static const std::string detectedChip = getDefaultChip(); // probed once
+    return detectedChip;
+  });
+}
+
+void SerializeToHsacoPass::getDependentDialects(
+    DialectRegistry &registry) const {
+  registerROCDLDialectTranslation(registry); // ROCDL-specific addition.
+  gpu::SerializeToBlobPass::getDependentDialects(registry); // Base class deps.
+}
+
+// Assembles 'isa' into an object image; returns null after emitting an error.
+std::unique_ptr<SmallVectorImpl<char>>
+SerializeToHsacoPass::assembleIsa(const std::string &isa) {
+  auto loc = getOperation().getLoc();
+  SmallVector<char, 0> result;
+  llvm::raw_svector_ostream os(result);
+
+  llvm::Triple triple(llvm::Triple::normalize(this->triple));
+  std::string error;
+  const llvm::Target *target =
+      llvm::TargetRegistry::lookupTarget(triple.normalize(), error);
+  if (!target) {
+    emitError(loc, Twine("failed to lookup target: ") + error);
+    return {};
+  }
+
+  llvm::SourceMgr srcMgr;
+  srcMgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(isa),
+                            llvm::SMLoc());
+
+  const llvm::MCTargetOptions mcOptions;
+  std::unique_ptr<llvm::MCRegisterInfo> mri(
+      target->createMCRegInfo(this->triple));
+  std::unique_ptr<llvm::MCAsmInfo> mai(
+      target->createMCAsmInfo(*mri, this->triple, mcOptions));
+  mai->setRelaxELFRelocations(true);
+
+  llvm::MCObjectFileInfo mofi;
+  llvm::MCContext ctx(mai.get(), mri.get(), &mofi, &srcMgr, &mcOptions);
+  mofi.InitMCObjectFileInfo(triple, false, ctx, false);
+
+  SmallString<128> cwd;
+  if (!llvm::sys::fs::current_path(cwd))
+    ctx.setCompilationDir(cwd);
+
+  std::unique_ptr<llvm::MCStreamer> mcStreamer;
+  std::unique_ptr<llvm::MCInstrInfo> mcii(target->createMCInstrInfo());
+  std::unique_ptr<llvm::MCSubtargetInfo> sti(
+      target->createMCSubtargetInfo(this->triple, this->chip, this->features));
+
+  llvm::MCCodeEmitter *ce = target->createMCCodeEmitter(*mcii, *mri, ctx);
+  llvm::MCAsmBackend *mab = target->createMCAsmBackend(*sti, *mri, mcOptions);
+  mcStreamer.reset(target->createMCObjectStreamer(
+      triple, ctx, std::unique_ptr<llvm::MCAsmBackend>(mab),
+      mab->createObjectWriter(os), std::unique_ptr<llvm::MCCodeEmitter>(ce),
+      *sti, mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible,
+      /*DWARFMustBeAtTheEnd*/ false));
+  mcStreamer->setUseAssemblerInfoForParsing(true);
+
+  std::unique_ptr<llvm::MCAsmParser> parser(
+      createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai));
+  std::unique_ptr<llvm::MCTargetAsmParser> tap(
+      target->createMCAsmParser(*sti, *parser, *mcii, mcOptions));
+  if (!tap) {
+    emitError(loc, "assembler initialization error");
+    return {};
+  }
+  parser->setTargetParser(*tap);
+  if (parser->Run(/*NoInitialTextSection=*/false)) { // true means failure
+    emitError(loc, "ISA assembly error");
+    return {};
+  }
+  return std::make_unique<SmallVector<char, 0>>(std::move(result));
+}
+
+// Links the object in 'isaBinary' into a HSA code object through temp files.
+std::unique_ptr<std::vector<char>>
+SerializeToHsacoPass::createHsaco(const SmallVectorImpl<char> &isaBinary) {
+  auto loc = getOperation().getLoc();
+  // Save the ISA binary to a temp file.
+  int tempIsaBinaryFd = -1;
+  SmallString<128> tempIsaBinaryFilename;
+  if (llvm::sys::fs::createTemporaryFile("kernel", "o", tempIsaBinaryFd,
+                                         tempIsaBinaryFilename)) {
+    emitError(loc, "temporary file for ISA binary creation error");
+    return {};
+  }
+  llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
+  llvm::raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true);
+  tempIsaBinaryOs << StringRef(isaBinary.data(), isaBinary.size());
+  tempIsaBinaryOs.close();
+
+  // Create a temp file for HSA code object. lld writes it by name, so only the
+  // path is requested; no descriptor is opened that would need closing.
+  SmallString<128> tempHsacoFilename;
+  if (llvm::sys::fs::createTemporaryFile("kernel", "hsaco",
+                                         tempHsacoFilename)) {
+    emitError(loc, "temporary file for HSA code object creation error");
+    return {};
+  }
+  llvm::FileRemover cleanupHsaco(tempHsacoFilename);
+
+  {
+    static std::mutex mutex;
+    const std::lock_guard<std::mutex> lock(mutex);
+    // Invoke lld under the lock; link() returns true on success.
+    if (!lld::elf::link({"ld.lld", "-shared", tempIsaBinaryFilename.c_str(),
+                         "-o", tempHsacoFilename.c_str()},
+                        /*canEarlyExit=*/false, llvm::outs(), llvm::errs())) {
+      emitError(loc, "lld invocation error");
+      return {};
+    }
+  }
+
+  // Load the HSA code object.
+  auto hsacoFile = openInputFile(tempHsacoFilename);
+  if (!hsacoFile) {
+    emitError(loc, "read HSA code object from temp file error");
+    return {};
+  }
+
+  StringRef buffer = hsacoFile->getBuffer();
+  return std::make_unique<std::vector<char>>(buffer.begin(), buffer.end());
+}
+
+std::unique_ptr<std::vector<char>>
+SerializeToHsacoPass::serializeISA(const std::string &isa) {
+  // Assemble first; link into a HSACO only if that produced an object.
+  if (auto assembled = assembleIsa(isa))
+    return createHsaco(*assembled);
+  return {};
+}
+
+// Registers the "gpu-to-hsaco" pass, which attaches HSACO binary annotations.
+void mlir::registerGpuSerializeToHsacoPass() {
+  PassRegistration<SerializeToHsacoPass> registerSerializeToHSACO(
+      "gpu-to-hsaco", "Lower GPU kernel function to HSACO binary annotations",
+      [] {
+        // Initialize the LLVM AMDGPU backend before the pass is constructed.
+        LLVMInitializeAMDGPUAsmParser();
+        LLVMInitializeAMDGPUAsmPrinter();
+        LLVMInitializeAMDGPUTarget();
+        LLVMInitializeAMDGPUTargetInfo();
+        LLVMInitializeAMDGPUTargetMC();
+
+        return std::make_unique<SerializeToHsacoPass>();
+      });
+}
+#else  // MLIR_GPU_TO_HSACO_PASS_ENABLE
+void mlir::registerGpuSerializeToHsacoPass() {} // No-op: pass compiled out.
+#endif // MLIR_GPU_TO_HSACO_PASS_ENABLE
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -7,6 +7,7 @@
   CudaRuntimeWrappers.cpp
   SparseUtils.cpp
   ExecutionEngine.cpp
+  RocmRuntimeWrappers.cpp
   RunnerUtils.cpp
   OptUtils.cpp
   JitRunner.cpp
@@ -136,3 +137,66 @@
     ${CUDA_RUNTIME_LIBRARY}
   )
 endif()
+
+if(MLIR_ROCM_RUNNER_ENABLED)
+  if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD))
+    message(SEND_ERROR
+      "Building the mlir rocm runner requires the AMDGPU backend")
+  endif()
+
+  # Ensure lld is enabled.
+  if (NOT "lld" IN_LIST LLVM_ENABLE_PROJECTS)
+    message(SEND_ERROR "lld is not enabled. Please revise LLVM_ENABLE_PROJECTS")
+  endif()
+
+  # lld header files.
+  include_directories(${MLIR_SOURCE_DIR}/../lld/include)
+
+  # Configure ROCm support; HIP_PATH defaults even when ROCM_PATH is user-set.
+  if (NOT DEFINED ROCM_PATH)
+    if (NOT DEFINED ENV{ROCM_PATH})
+      set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
+    else()
+      set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
+    endif()
+  endif()
+  set(HIP_PATH "${ROCM_PATH}/hip" CACHE PATH "Path to which HIP has been installed")
+  set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
+  find_package(HIP)
+  if (NOT HIP_FOUND)
+    message(SEND_ERROR "Build the mlir rocm runner requires a working ROCm and HIP install")
+  else()
+    message(STATUS "ROCm HIP version: ${HIP_VERSION}")
+  endif()
+
+  # Set compile-time flags for ROCm path.
+  add_definitions(-D__ROCM_PATH__="${ROCM_PATH}")
+
+  # Locate HIP runtime library.
+  find_library(ROCM_RUNTIME_LIBRARY amdhip64
+               PATHS "${HIP_PATH}/lib")
+  if (NOT ROCM_RUNTIME_LIBRARY)
+    message(SEND_ERROR "Could not locate ROCm HIP runtime library")
+  else()
+    message(STATUS "ROCm HIP runtime lib: ${ROCM_RUNTIME_LIBRARY}")
+  endif()
+
+  # Set HIP compile-time flags.
+  add_definitions(-D__HIP_PLATFORM_HCC__)
+
+  add_mlir_library(mlir_rocm_runtime
+    SHARED
+    RocmRuntimeWrappers.cpp
+
+    EXCLUDE_FROM_LIBMLIR
+  )
+  target_include_directories(mlir_rocm_runtime
+    PRIVATE
+    "${ROCM_PATH}/include"
+    "${HIP_PATH}/include"
+  )
+  target_link_libraries(mlir_rocm_runtime
+    PRIVATE
+    ${ROCM_RUNTIME_LIBRARY}
+  )
+endif()
diff --git a/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
rename from mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp
rename to mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
--- a/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -1,4 +1,4 @@
-//===- rocm-runtime-wrappers.cpp - MLIR ROCM runner wrapper library -------===//
+//===- RocmRuntimeWrappers.cpp - MLIR ROCM runner wrapper library ---------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -30,29 +30,25 @@
     fprintf(stderr, "'%s' failed with '%s'\n", #expr, name);                   \
   }(expr)
 
-// Static reference to HIP primary context for device ordinal 0.
-static hipCtx_t Context = [] {
-  HIP_REPORT_IF_ERROR(hipInit(/*flags=*/0));
-  hipDevice_t device;
-  HIP_REPORT_IF_ERROR(hipDeviceGet(&device, /*ordinal=*/0));
-  hipCtx_t context;
-  HIP_REPORT_IF_ERROR(hipDevicePrimaryCtxRetain(&context, device));
-  return context;
-}();
-
 // Sets the `Context` for the duration of the instance and restores the previous
 // context on destruction.
 class ScopedContext {
 public:
   ScopedContext() {
-    HIP_REPORT_IF_ERROR(hipCtxGetCurrent(&previous));
-    HIP_REPORT_IF_ERROR(hipCtxSetCurrent(Context));
+    // Static reference to HIP primary context for device ordinal 0.
+    static hipCtx_t context = [] {
+      HIP_REPORT_IF_ERROR(hipInit(/*flags=*/0));
+      hipDevice_t device;
+      HIP_REPORT_IF_ERROR(hipDeviceGet(&device, /*ordinal=*/0));
+      hipCtx_t ctx;
+      HIP_REPORT_IF_ERROR(hipDevicePrimaryCtxRetain(&ctx, device));
+      return ctx;
+    }();
+
+    HIP_REPORT_IF_ERROR(hipCtxPushCurrent(context));
   }
 
-  ~ScopedContext() { HIP_REPORT_IF_ERROR(hipCtxSetCurrent(previous)); }
-
-private:
-  hipCtx_t previous;
+  ~ScopedContext() { HIP_REPORT_IF_ERROR(hipCtxPopCurrent(nullptr)); }
 };
 
 extern "C" hipModule_t mgpuModuleLoad(void *data) {
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -21,8 +21,7 @@
 set(MLIR_RUNNER_UTILS_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
 
 # Passed to lit.site.cfg.py.in to set up the path where to find the libraries
-# for the mlir rocm / spirv / vulkan runner tests.
-set(MLIR_ROCM_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
+# for the mlir spirv / vulkan runner tests.
 set(MLIR_SPIRV_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
 set(MLIR_VULKAN_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
 
@@ -74,6 +73,10 @@
   list(APPEND MLIR_TEST_DEPENDS mlir_cuda_runtime)
 endif()
 
+if(MLIR_ROCM_RUNNER_ENABLED)
+  list(APPEND MLIR_TEST_DEPENDS mlir_rocm_runtime)
+endif()
+
 list(APPEND MLIR_TEST_DEPENDS MLIRUnitTests)
 
 if(LLVM_BUILD_EXAMPLES)
@@ -88,12 +91,6 @@
     )
 endif()
 
-if(MLIR_ROCM_RUNNER_ENABLED)
-  list(APPEND MLIR_TEST_DEPENDS
-    mlir-rocm-runner
-  )
-endif()
-
 if(MLIR_SPIRV_CPU_RUNNER_ENABLED)
   add_subdirectory(mlir-spirv-cpu-runner)
   list(APPEND MLIR_TEST_DEPENDS
diff --git a/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir
--- a/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir
+++ b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir
@@ -1,6 +1,6 @@
-// RUN: mlir-opt %s --test-kernel-to-hsaco -split-input-file | FileCheck %s
+// RUN: mlir-opt %s --test-gpu-to-hsaco | FileCheck %s
 
-// CHECK: attributes {rocdl.hsaco = "HSACO"}
+// CHECK: gpu.module @foo attributes {gpu.binary = "HSACO"}
 gpu.module @foo {
   llvm.func @kernel(%arg0 : f32, %arg1 : !llvm.ptr<f32>)
     // CHECK: attributes  {gpu.kernel}
@@ -9,8 +9,7 @@
   }
 }
 
-// -----
-
+// CHECK: gpu.module @bar attributes {gpu.binary = "HSACO"}
 gpu.module @bar {
   // CHECK: func @kernel_a
   llvm.func @kernel_a()
diff --git a/mlir/test/Integration/GPU/CUDA/lit.local.cfg b/mlir/test/Integration/GPU/CUDA/lit.local.cfg
--- a/mlir/test/Integration/GPU/CUDA/lit.local.cfg
+++ b/mlir/test/Integration/GPU/CUDA/lit.local.cfg
@@ -1,2 +1,2 @@
 if not config.enable_cuda_runner:
-  config.unsupported = True
\ No newline at end of file
+  config.unsupported = True
diff --git a/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir b/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
rename from mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir
rename to mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
--- a/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir
+++ b/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
@@ -1,5 +1,9 @@
-// RUN: mlir-rocm-runner %s \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN: mlir-opt %s \
+// RUN:   -gpu-kernel-outlining \
+// RUN:   -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco)' \
+// RUN:   -gpu-to-llvm \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
diff --git a/mlir/test/mlir-rocm-runner/lit.local.cfg b/mlir/test/Integration/GPU/ROCM/lit.local.cfg
rename from mlir/test/mlir-rocm-runner/lit.local.cfg
rename to mlir/test/Integration/GPU/ROCM/lit.local.cfg
diff --git a/mlir/test/mlir-rocm-runner/two-modules.mlir b/mlir/test/Integration/GPU/ROCM/two-modules.mlir
rename from mlir/test/mlir-rocm-runner/two-modules.mlir
rename to mlir/test/Integration/GPU/ROCM/two-modules.mlir
--- a/mlir/test/mlir-rocm-runner/two-modules.mlir
+++ b/mlir/test/Integration/GPU/ROCM/two-modules.mlir
@@ -1,5 +1,9 @@
-// RUN: mlir-rocm-runner %s \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN: mlir-opt %s \
+// RUN:   -gpu-kernel-outlining \
+// RUN:   -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco)' \
+// RUN:   -gpu-to-llvm \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
diff --git a/mlir/test/mlir-rocm-runner/vecadd.mlir b/mlir/test/Integration/GPU/ROCM/vecadd.mlir
rename from mlir/test/mlir-rocm-runner/vecadd.mlir
rename to mlir/test/Integration/GPU/ROCM/vecadd.mlir
--- a/mlir/test/mlir-rocm-runner/vecadd.mlir
+++ b/mlir/test/Integration/GPU/ROCM/vecadd.mlir
@@ -1,5 +1,9 @@
-// RUN: mlir-rocm-runner %s \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN: mlir-opt %s \
+// RUN:   -gpu-kernel-outlining \
+// RUN:   -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco)' \
+// RUN:   -gpu-to-llvm \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
diff --git a/mlir/test/mlir-rocm-runner/vector-transferops.mlir b/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
rename from mlir/test/mlir-rocm-runner/vector-transferops.mlir
rename to mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
--- a/mlir/test/mlir-rocm-runner/vector-transferops.mlir
+++ b/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
@@ -1,5 +1,9 @@
-// RUN: mlir-rocm-runner %s \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN: mlir-opt %s \
+// RUN:   -gpu-kernel-outlining \
+// RUN:   -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco)' \
+// RUN:   -gpu-to-llvm \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
diff --git a/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp b/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp
--- a/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp
+++ b/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp
@@ -6,11 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
-#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/Dialect/GPU/Passes.h"
+
 #include "mlir/Pass/Pass.h"
-#include "mlir/Pass/PassManager.h"
-#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Export.h"
 #include "llvm/Support/TargetSelect.h"
@@ -18,38 +16,54 @@
 using namespace mlir;
 
 #if MLIR_ROCM_CONVERSIONS_ENABLED
-static OwnedBlob compileIsaToHsacoForTesting(const std::string &, Location,
-                                             StringRef) {
-  const char data[] = "HSACO";
-  return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1);
+namespace {
+class TestSerializeToHsacoPass
+    : public PassWrapper<TestSerializeToHsacoPass, gpu::SerializeToBlobPass> {
+public:
+  TestSerializeToHsacoPass();
+
+private:
+  void getDependentDialects(DialectRegistry &registry) const override;
+
+  // Serializes ROCDL IR to HSACO.
+  std::unique_ptr<std::vector<char>>
+  serializeISA(const std::string &isa) override;
+};
+} // namespace
+
+TestSerializeToHsacoPass::TestSerializeToHsacoPass() {
+  this->triple = "amdgcn-amd-amdhsa";
+  this->chip = "gfx900";
+}
+
+void TestSerializeToHsacoPass::getDependentDialects(
+    DialectRegistry &registry) const {
+  registerROCDLDialectTranslation(registry);
+  gpu::SerializeToBlobPass::getDependentDialects(registry);
 }
 
-static std::unique_ptr<llvm::Module>
-translateModuleToROCDL(Operation *m, llvm::LLVMContext &llvmContext,
-                       StringRef moduleName) {
-  registerLLVMDialectTranslation(*m->getContext());
-  registerROCDLDialectTranslation(*m->getContext());
-  return translateModuleToLLVMIR(m, llvmContext, moduleName);
+std::unique_ptr<std::vector<char>>
+TestSerializeToHsacoPass::serializeISA(const std::string &) {
+  std::string data = "HSACO";
+  return std::make_unique<std::vector<char>>(data.begin(), data.end());
 }
 
 namespace mlir {
 namespace test {
-void registerTestConvertGPUKernelToHsacoPass() {
-  PassPipelineRegistration<>(
-      "test-kernel-to-hsaco",
-      "Convert all kernel functions to ROCm hsaco blobs",
-      [](OpPassManager &pm) {
+// Register test pass to serialize GPU module to a HSAco binary annotation.
+void registerTestGpuSerializeToHsacoPass() {
+  PassRegistration<TestSerializeToHsacoPass> registerSerializeToHsaco(
+      "test-gpu-to-hsaco",
+      "Lower GPU kernel function to HSAco binary annotations", [] {
         // Initialize LLVM AMDGPU backend.
         LLVMInitializeAMDGPUTarget();
         LLVMInitializeAMDGPUTargetInfo();
         LLVMInitializeAMDGPUTargetMC();
         LLVMInitializeAMDGPUAsmPrinter();
 
-        pm.addPass(createConvertGPUKernelToBlobPass(
-            translateModuleToROCDL, compileIsaToHsacoForTesting,
-            "amdgcn-amd-amdhsa", "gfx900", "-code-object-v3", "rocdl.hsaco"));
+        return std::make_unique<TestSerializeToHsacoPass>();
       });
 }
 } // namespace test
 } // namespace mlir
-#endif
+#endif // MLIR_ROCM_CONVERSIONS_ENABLED
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -77,7 +77,6 @@
     ToolSubst('toy-ch5', unresolved='ignore'),
     ToolSubst('%linalg_test_lib_dir', config.linalg_test_lib_dir, unresolved='ignore'),
     ToolSubst('%mlir_runner_utils_dir', config.mlir_runner_utils_dir, unresolved='ignore'),
-    ToolSubst('%rocm_wrapper_library_dir', config.rocm_wrapper_library_dir, unresolved='ignore'),
     ToolSubst('%spirv_wrapper_library_dir', config.spirv_wrapper_library_dir, unresolved='ignore'),
     ToolSubst('%vulkan_wrapper_library_dir', config.vulkan_wrapper_library_dir, unresolved='ignore'),
     ToolSubst('%mlir_integration_test_dir', config.mlir_integration_test_dir, unresolved='ignore'),
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -39,7 +39,6 @@
 config.run_cuda_tests = @MLIR_CUDA_CONVERSIONS_ENABLED@
 config.enable_cuda_runner = @MLIR_CUDA_RUNNER_ENABLED@
 config.run_rocm_tests = @MLIR_ROCM_CONVERSIONS_ENABLED@
-config.rocm_wrapper_library_dir = "@MLIR_ROCM_WRAPPER_LIBRARY_DIR@"
 config.enable_rocm_runner = @MLIR_ROCM_RUNNER_ENABLED@
 config.spirv_wrapper_library_dir = "@MLIR_SPIRV_WRAPPER_LIBRARY_DIR@"
 config.enable_spirv_cpu_runner = @MLIR_SPIRV_CPU_RUNNER_ENABLED@
diff --git a/mlir/tools/CMakeLists.txt b/mlir/tools/CMakeLists.txt
--- a/mlir/tools/CMakeLists.txt
+++ b/mlir/tools/CMakeLists.txt
@@ -1,7 +1,6 @@
 add_subdirectory(mlir-cpu-runner)
 add_subdirectory(mlir-opt)
 add_subdirectory(mlir-reduce)
-add_subdirectory(mlir-rocm-runner)
 add_subdirectory(mlir-shlib)
 add_subdirectory(mlir-spirv-cpu-runner)
 add_subdirectory(mlir-translate)
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -65,7 +65,7 @@
 void registerTestConstantFold();
 void registerTestConvVectorization();
 void registerTestGpuSerializeToCubinPass();
-void registerTestConvertGPUKernelToHsacoPass();
+void registerTestGpuSerializeToHsacoPass();
 void registerTestDataLayoutQuery();
 void registerTestDecomposeCallGraphTypes();
 void registerTestDialect(DialectRegistry &);
@@ -140,7 +140,7 @@
   test::registerTestGpuSerializeToCubinPass();
 #endif
 #if MLIR_ROCM_CONVERSIONS_ENABLED
-  test::registerTestConvertGPUKernelToHsacoPass();
+  test::registerTestGpuSerializeToHsacoPass();
 #endif
   test::registerTestConvVectorization();
   test::registerTestDecomposeCallGraphTypes();
diff --git a/mlir/tools/mlir-rocm-runner/CMakeLists.txt b/mlir/tools/mlir-rocm-runner/CMakeLists.txt
deleted file mode 100644
--- a/mlir/tools/mlir-rocm-runner/CMakeLists.txt
+++ /dev/null
@@ -1,127 +0,0 @@
-set(LLVM_OPTIONAL_SOURCES
-  rocm-runtime-wrappers.cpp
-  mlir-rocm-runner.cpp
-  )
-
-if(MLIR_ROCM_RUNNER_ENABLED)
-  if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD))
-    message(SEND_ERROR
-      "Building the mlir rocm runner requires the AMDGPU backend")
-  endif()
-
-  # Ensure lld is enabled.
-  if (NOT "lld" IN_LIST LLVM_ENABLE_PROJECTS)
-    message(SEND_ERROR "lld is not enabled. Please revise LLVM_ENABLE_PROJECTS")
-  endif()
-
-  # lld header files.
-  include_directories(${MLIR_SOURCE_DIR}/../lld/include)
-
-  # Configure ROCm support.
-  if (NOT DEFINED ROCM_PATH)
-    if (NOT DEFINED ENV{ROCM_PATH})
-      set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
-    else()
-      set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
-    endif()
-    set(HIP_PATH "${ROCM_PATH}/hip" CACHE PATH " Path to which HIP has been installed")
-  endif()
-  set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
-  find_package(HIP)
-  if (NOT HIP_FOUND)
-    message(SEND_ERROR "Build the mlir rocm runner requires a working ROCm and HIP install")
-  else()
-    message(STATUS "ROCm HIP version: ${HIP_VERSION}")
-  endif()
-
-  # Set compile-time flags for ROCm path.
-  add_definitions(-D__ROCM_PATH__="${ROCM_PATH}")
-
-  # Locate HIP runtime library.
-  find_library(ROCM_RUNTIME_LIBRARY amdhip64
-               PATHS "${HIP_PATH}/lib")
-  if (NOT ROCM_RUNTIME_LIBRARY)
-    message(SEND_ERROR "Could not locate ROCm HIP runtime library")
-  else()
-    message(STATUS "ROCm HIP runtime lib: ${ROCM_RUNTIME_LIBRARY}")
-  endif()
-
-  # Set HIP compile-time flags.
-  add_definitions(-D__HIP_PLATFORM_HCC__)
-
-  add_mlir_library(rocm-runtime-wrappers
-    SHARED
-    rocm-runtime-wrappers.cpp
-
-    EXCLUDE_FROM_LIBMLIR
-  )
-  target_include_directories(rocm-runtime-wrappers
-    PRIVATE
-    "${HIP_PATH}/../include"
-    "${HIP_PATH}/include"
-  )
-  target_link_libraries(rocm-runtime-wrappers
-    PRIVATE
-    ${ROCM_RUNTIME_LIBRARY}
-  )
-
-  get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
-  set(LIBS
-    ${conversion_libs}
-    lldCommon
-    lldDriver
-    lldELF
-    MLIRJitRunner
-    MLIRAnalysis
-    MLIREDSC
-    MLIRExecutionEngine
-    MLIRGPU
-    MLIRIR
-    MLIRLLVMIR
-    MLIRLLVMToLLVMIRTranslation
-    MLIRParser
-    MLIRROCDLIR
-    MLIRStandard
-    MLIRSupport
-    MLIRTargetLLVMIRExport
-    MLIRROCDLToLLVMIRTranslation
-    MLIRTransforms
-    MLIRTranslation
-    ${ROCM_RUNTIME_LIBRARY}
-  )
-
-  # Manually expand the target library, since our MLIR libraries
-  # aren't plugged into the LLVM dependency tracking. If we don't
-  # do this then we can't insert the CodeGen library after ourselves
-  llvm_expand_pseudo_components(TARGET_LIBS AllTargetsCodeGens AllTargetsAsmParsers)
-  # Prepend LLVM in front of every target, this is how the library
-  # are named with CMake
-  SET(targets_to_link)
-  FOREACH(t ${TARGET_LIBS})
-    LIST(APPEND targets_to_link "LLVM${t}")
-  ENDFOREACH(t)
-
-  add_llvm_tool(mlir-rocm-runner
-    mlir-rocm-runner.cpp
-
-    DEPENDS
-    rocm-runtime-wrappers
-
-    LINK_COMPONENTS
-
-    Core
-    LTO
-    MC
-    MCParser
-    Option
-    Support
-    )
-  llvm_update_compile_flags(mlir-rocm-runner)
-  target_include_directories(mlir-rocm-runner
-    PRIVATE
-    "${HIP_PATH}/../include"
-    "${HIP_PATH}/include"
-  )
-  target_link_libraries(mlir-rocm-runner PRIVATE ${LIBS} ${targets_to_link})
-
-endif()
diff --git a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
deleted file mode 100644
--- a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
+++ /dev/null
@@ -1,349 +0,0 @@
-//===- mlir-rocm-runner.cpp - MLIR ROCM Execution Driver-------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This is a command line utility that executes an MLIR file on the GPU by
-// translating MLIR to ROCDL/LLVM IR before JIT-compiling and executing the
-// latter.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/STLExtras.h"
-
-#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
-#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
-#include "mlir/Conversion/SCFToStandard/SCFToStandard.h"
-#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
-#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
-#include "mlir/Dialect/GPU/GPUDialect.h"
-#include "mlir/Dialect/GPU/Passes.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/ExecutionEngine/JitRunner.h"
-#include "mlir/ExecutionEngine/OptUtils.h"
-#include "mlir/IR/BuiltinOps.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Pass/PassManager.h"
-#include "mlir/Support/FileUtilities.h"
-#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
-#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
-#include "mlir/Target/LLVMIR/Export.h"
-#include "mlir/Transforms/DialectConversion.h"
-#include "mlir/Transforms/Passes.h"
-#include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/FileUtilities.h"
-#include "llvm/Support/InitLLVM.h"
-#include "llvm/Support/LineIterator.h"
-#include "llvm/Support/Program.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/TargetSelect.h"
-
-// MC headers.
-#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCParser/AsmLexer.h"
-#include "llvm/MC/MCParser/MCTargetAsmParser.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCTargetOptionsCommandFlags.h"
-
-// lld headers.
-#include "lld/Common/Driver.h"
-
-// HIP headers.
-#include "hip/hip_version.h"
-
-#include <mutex>
-
-using namespace mlir;
-using namespace llvm;
-
-using Blob = SmallVector<char, 0>;
-
-static cl::opt<std::string> tripleName("triple", cl::desc("target triple"),
-                                       cl::value_desc("triple string"),
-                                       cl::init("amdgcn-amd-amdhsa"));
-
-static cl::opt<std::string> targetChip("target", cl::desc("target chip"),
-                                       cl::value_desc("AMDGPU ISA version"),
-                                       cl::init(""));
-
-static cl::opt<std::string> features("feature", cl::desc("target features"),
-                                     cl::value_desc("AMDGPU target features"),
-                                     cl::init(""));
-
-static constexpr const char kRunnerProgram[] = "mlir-rocm-runner";
-static constexpr const char kRocmAgentEnumerator[] = "rocm_agent_enumerator";
-static constexpr const char kDefaultTargetChip[] = "gfx900";
-
-static LogicalResult assembleIsa(const std::string isa, StringRef name,
-                                 Blob &result) {
-  raw_svector_ostream os(result);
-
-  std::string error;
-  Triple theTriple(Triple::normalize(tripleName));
-  const Target *theTarget =
-      TargetRegistry::lookupTarget(theTriple.normalize(), error);
-  if (!theTarget) {
-    WithColor::error(errs(), name) << error;
-    return failure();
-  }
-
-  SourceMgr srcMgr;
-  srcMgr.AddNewSourceBuffer(MemoryBuffer::getMemBuffer(isa), SMLoc());
-
-  const MCTargetOptions mcOptions;
-  std::unique_ptr<MCRegisterInfo> mri(theTarget->createMCRegInfo(tripleName));
-  std::unique_ptr<MCAsmInfo> mai(
-      theTarget->createMCAsmInfo(*mri, tripleName, mcOptions));
-  mai->setRelaxELFRelocations(true);
-
-  MCObjectFileInfo mofi;
-  MCContext ctx(mai.get(), mri.get(), &mofi, &srcMgr, &mcOptions);
-  mofi.InitMCObjectFileInfo(theTriple, false, ctx, false);
-
-  SmallString<128> cwd;
-  if (!sys::fs::current_path(cwd))
-    ctx.setCompilationDir(cwd);
-
-  std::unique_ptr<MCStreamer> mcStreamer;
-  std::unique_ptr<MCInstrInfo> mcii(theTarget->createMCInstrInfo());
-  std::unique_ptr<MCSubtargetInfo> sti(
-      theTarget->createMCSubtargetInfo(tripleName, targetChip, features));
-
-  MCCodeEmitter *ce = theTarget->createMCCodeEmitter(*mcii, *mri, ctx);
-  MCAsmBackend *mab = theTarget->createMCAsmBackend(*sti, *mri, mcOptions);
-  mcStreamer.reset(theTarget->createMCObjectStreamer(
-      theTriple, ctx, std::unique_ptr<MCAsmBackend>(mab),
-      mab->createObjectWriter(os), std::unique_ptr<MCCodeEmitter>(ce), *sti,
-      mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible,
-      /*DWARFMustBeAtTheEnd*/ false));
-  mcStreamer->setUseAssemblerInfoForParsing(true);
-
-  std::unique_ptr<MCAsmParser> parser(
-      createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai));
-  std::unique_ptr<MCTargetAsmParser> tap(
-      theTarget->createMCAsmParser(*sti, *parser, *mcii, mcOptions));
-
-  if (!tap) {
-    WithColor::error(errs(), name) << "assembler initialization error.\n";
-    return failure();
-  }
-
-  parser->setTargetParser(*tap);
-  parser->Run(false);
-
-  return success();
-}
-
-static std::mutex mutex;
-static LogicalResult createHsaco(const Blob &isaBlob, StringRef name,
-                                 Blob &hsacoBlob) {
-  // Save the ISA binary to a temp file.
-  int tempIsaBinaryFd = -1;
-  SmallString<128> tempIsaBinaryFilename;
-  std::error_code ec = sys::fs::createTemporaryFile(
-      "kernel", "o", tempIsaBinaryFd, tempIsaBinaryFilename);
-  if (ec) {
-    WithColor::error(errs(), name)
-        << "temporary file for ISA binary creation error.\n";
-    return failure();
-  }
-  FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
-  raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true);
-  tempIsaBinaryOs << isaBlob;
-  tempIsaBinaryOs.close();
-
-  // Create a temp file for HSA code object.
-  int tempHsacoFD = -1;
-  SmallString<128> tempHsacoFilename;
-  ec = sys::fs::createTemporaryFile("kernel", "hsaco", tempHsacoFD,
-                                    tempHsacoFilename);
-  if (ec) {
-    WithColor::error(errs(), name)
-        << "temporary file for HSA code object creation error.\n";
-    return failure();
-  }
-  FileRemover cleanupHsaco(tempHsacoFilename);
-
-  const std::lock_guard<std::mutex> lock(mutex);
-  // Invoke lld. Expect a true return value from lld.
-  bool ret = lld::elf::link({"ld.lld", "-shared", tempIsaBinaryFilename.c_str(),
-                             "-o", tempHsacoFilename.c_str()},
-                            /*canEarlyExit=*/false, llvm::outs(), llvm::errs());
-  if (!ret) {
-    WithColor::error(errs(), name) << "lld invocation error.\n";
-    return failure();
-  }
-
-  // Load the HSA code object.
-  auto hsacoFile = mlir::openInputFile(tempHsacoFilename);
-  if (!hsacoFile) {
-    WithColor::error(errs(), name)
-        << "read HSA code object from temp file error.\n";
-    return failure();
-  }
-  hsacoBlob.assign(hsacoFile->getBuffer().begin(),
-                   hsacoFile->getBuffer().end());
-
-  return success();
-}
-
-static std::unique_ptr<llvm::Module>
-compileModuleToROCDLIR(Operation *m, llvm::LLVMContext &llvmContext,
-                       StringRef name) {
-  auto llvmModule = translateModuleToROCDLIR(m, llvmContext, name);
-  // TODO: Link with ROCm-Device-Libs in case needed (ex: the Module
-  // depends on math functions).
-  return llvmModule;
-}
-
-static OwnedBlob compileISAToHsaco(const std::string isa, Location loc,
-                                   StringRef name) {
-  // ISA -> ISA in binary form via MC.
-  // Use lld to create HSA code object.
-  Blob isaBlob;
-  Blob hsacoBlob;
-
-  if (succeeded(assembleIsa(isa, name, isaBlob)) &&
-      succeeded(createHsaco(isaBlob, name, hsacoBlob)))
-    return std::make_unique<std::vector<char>>(hsacoBlob.begin(),
-                                               hsacoBlob.end());
-
-  WithColor::error(errs(), name) << "producing HSA code object error.\n";
-  return {};
-}
-
-static void configTargetChip() {
-  // Set targetChip to default value first.
-  targetChip = kDefaultTargetChip;
-
-  // Locate rocm_agent_enumerator.
-  llvm::ErrorOr<std::string> rocmAgentEnumerator = llvm::sys::findProgramByName(
-      kRocmAgentEnumerator, {__ROCM_PATH__ "/bin"});
-  std::error_code ec;
-  if ((ec = rocmAgentEnumerator.getError())) {
-    WithColor::warning(errs(), kRunnerProgram)
-        << kRocmAgentEnumerator << " couldn't be located under "
-        << __ROCM_PATH__ << ", set target as " << kDefaultTargetChip << "\n";
-    return;
-  }
-
-  // Prepare temp file to hold the outputs.
-  int tempFd = -1;
-  SmallString<128> tempFilename;
-  ec = sys::fs::createTemporaryFile("rocm_agent", "txt", tempFd, tempFilename);
-  if (ec) {
-    WithColor::warning(errs(), kRunnerProgram)
-        << "temporary file for " << kRocmAgentEnumerator
-        << " creation error, set target as " << kDefaultTargetChip << "\n";
-    return;
-  }
-  FileRemover cleanup(tempFilename);
-
-  // Invoke rocm_agent_enumerator.
-  std::string errorMessage;
-  SmallVector<StringRef, 2> args{"-t", "GPU"};
-  Optional<StringRef> redirects[3] = {{""}, tempFilename.str(), {""}};
-  int result =
-      llvm::sys::ExecuteAndWait(rocmAgentEnumerator.get(), args, llvm::None,
-                                redirects, 0, 0, &errorMessage);
-  if (result) {
-    WithColor::warning(errs(), kRunnerProgram)
-        << kRocmAgentEnumerator << " invocation error: " << errorMessage
-        << ", set target as " << kDefaultTargetChip << "\n";
-    return;
-  }
-
-  // Load and parse the result.
-  auto gfxIsaList = mlir::openInputFile(tempFilename);
-  if (!gfxIsaList) {
-    WithColor::error(errs(), kRunnerProgram)
-        << "read ROCm agent list temp file error, set target as "
-        << kDefaultTargetChip << "\n";
-    return;
-  }
-  for (line_iterator lines(*gfxIsaList); !lines.is_at_end(); ++lines) {
-    // Skip the line with content "gfx000".
-    if (*lines == "gfx000")
-      continue;
-    // Use the first ISA version found.
-    targetChip = lines->str();
-    break;
-  }
-}
-
-static void configTargetFeatures() {
-  if (features.size() > 0)
-    features += ",";
-  // After ROCm 3.5, adopt HSA code object V3.
-  if (HIP_VERSION_MAJOR >= 3 && HIP_VERSION_MINOR >= 5)
-    features += "+code-object-v3";
-  else
-    features += "-code-object-v3";
-}
-
-static LogicalResult runMLIRPasses(ModuleOp m) {
-  PassManager pm(m.getContext());
-  applyPassManagerCLOptions(pm);
-
-  // Configure target chip ISA version if it has not been specified.
-  if (!targetChip.size())
-    configTargetChip();
-
-  // Configure target features per ROCm / HIP version.
-  configTargetFeatures();
-
-  const char gpuBinaryAnnotation[] = "rocdl.hsaco";
-  pm.addPass(createLowerToCFGPass());
-  pm.addPass(createGpuKernelOutliningPass());
-  auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
-  kernelPm.addPass(createStripDebugInfoPass());
-  kernelPm.addPass(createLowerGpuOpsToROCDLOpsPass());
-  kernelPm.addPass(createConvertGPUKernelToBlobPass(
-      compileModuleToROCDLIR, compileISAToHsaco, tripleName, targetChip,
-      features, gpuBinaryAnnotation));
-  pm.addPass(createGpuToLLVMConversionPass(gpuBinaryAnnotation));
-
-  return pm.run(m);
-}
-
-int main(int argc, char **argv) {
-  registerPassManagerCLOptions();
-  llvm::InitLLVM y(argc, argv);
-  llvm::InitializeAllTargetInfos();
-  llvm::InitializeAllTargetMCs();
-  llvm::InitializeAllAsmParsers();
-
-  // Initialize LLVM AMDGPU backend.
-  LLVMInitializeAMDGPUTarget();
-  LLVMInitializeAMDGPUTargetInfo();
-  LLVMInitializeAMDGPUTargetMC();
-  LLVMInitializeAMDGPUAsmPrinter();
-
-  mlir::initializeLLVMPasses();
-
-  mlir::JitRunnerConfig jitRunnerConfig;
-  jitRunnerConfig.mlirTransformer = runMLIRPasses;
-
-  mlir::DialectRegistry registry;
-  registry.insert<mlir::LLVM::LLVMDialect, mlir::gpu::GPUDialect,
-                  mlir::ROCDL::ROCDLDialect, mlir::StandardOpsDialect>();
-  mlir::registerLLVMDialectTranslation(registry);
-  mlir::registerROCDLDialectTranslation(registry);
-
-  return mlir::JitRunnerMain(argc, argv, registry, jitRunnerConfig);
-}