diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -31,6 +31,15 @@
 # TODO: we should use a config.h file like LLVM does
 add_definitions(-DMLIR_CUDA_CONVERSIONS_ENABLED=${MLIR_CUDA_CONVERSIONS_ENABLED})
 
+# Build the ROCm conversions and run the corresponding tests if the AMDGPU
+# backend is available.
+if ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)
+  set(MLIR_ROCM_CONVERSIONS_ENABLED 1)
+else()
+  set(MLIR_ROCM_CONVERSIONS_ENABLED 0)
+endif()
+add_definitions(-DMLIR_ROCM_CONVERSIONS_ENABLED=${MLIR_ROCM_CONVERSIONS_ENABLED})
+
 set(MLIR_CUDA_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir CUDA runner")
 set(MLIR_VULKAN_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir Vulkan runner")
 
diff --git a/mlir/include/mlir/Conversion/GPUToROCm/GPUToROCmPass.h b/mlir/include/mlir/Conversion/GPUToROCm/GPUToROCmPass.h
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Conversion/GPUToROCm/GPUToROCmPass.h
@@ -0,0 +1,52 @@
+//===- GPUToROCmPass.h - MLIR ROCm runtime support --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_GPUTOROCM_GPUTOROCMPASS_H_
+#define MLIR_CONVERSION_GPUTOROCM_GPUTOROCMPASS_H_
+
+#include "mlir/Support/LLVM.h"
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace mlir {
+
+class Location;
+class ModuleOp;
+
+template <typename T>
+class OperationPass;
+
+namespace gpu {
+class GPUModuleOp;
+} // namespace gpu
+
+namespace LLVM {
+class LLVMDialect;
+} // namespace LLVM
+
+/// Byte buffer holding a HSACO blob; a null pointer means none was produced.
+using OwnedHsaco = std::unique_ptr<std::vector<char>>;
+/// Callback producing the HSACO from (assembly text, location, module name).
+using HsacoGenerator =
+    std::function<OwnedHsaco(const std::string &, Location, StringRef)>;
+
+/// Creates a pass to convert GPU kernel modules into HSA code object blobs.
+///
+/// For each 'gpu.module' the pass translates the module to LLVM IR via the
+/// ROCDL translation, lowers it to assembly with the AMDGPU backend and hands
+/// that assembly to the provided hsacoGenerator to produce a binary blob (the
+/// hsaco). The blob is attached to the 'gpu.module' as a string attribute
+/// named 'rocdl.hsaco'. The pass signals failure if the translation fails or
+/// if no blob is produced.
+std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
+createConvertGPUKernelToHsacoPass(HsacoGenerator hsacoGenerator);
+
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_GPUTOROCM_GPUTOROCMPASS_H_
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -18,6 +18,7 @@
 #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
 #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
 #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
+#include "mlir/Conversion/GPUToROCm/GPUToROCmPass.h"
 #include "mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.h"
 #include "mlir/Conversion/GPUToVulkan/ConvertGPUToVulkanPass.h"
 #include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h"
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -3,6 +3,7 @@
 add_subdirectory(GPUToCUDA)
 add_subdirectory(GPUToNVVM)
 add_subdirectory(GPUToROCDL)
+add_subdirectory(GPUToROCm)
 add_subdirectory(GPUToSPIRV)
 add_subdirectory(GPUToVulkan)
 add_subdirectory(LinalgToLLVM)
diff --git a/mlir/lib/Conversion/GPUToROCm/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCm/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToROCm/CMakeLists.txt
@@ -0,0 +1,40 @@
+set(LLVM_OPTIONAL_SOURCES
+  ConvertKernelFuncToHsaco.cpp
+)
+
+# TODO: add ConvertLaunchFuncToROCmCalls.cpp to SOURCES once it is available.
+# NOTE(review): until then SOURCES stays empty when
+# MLIR_ROCM_CONVERSIONS_ENABLED is off; confirm that
+# add_mlir_conversion_library accepts a library without source files.
+
+if (MLIR_ROCM_CONVERSIONS_ENABLED)
+  # The HSACO lowering needs the AMDGPU backend, so it is built only then.
+  list(APPEND SOURCES "ConvertKernelFuncToHsaco.cpp")
+  set(AMDGPU_LIBS
+    MC
+    AMDGPUCodeGen
+    AMDGPUDesc
+    AMDGPUInfo
+  )
+endif()
+
+add_mlir_conversion_library(MLIRGPUtoROCmTransforms
+  ${SOURCES}
+
+  DEPENDS
+  MLIRConversionPassIncGen
+  intrinsics_gen
+
+  LINK_COMPONENTS
+  Core
+  ${AMDGPU_LIBS}
+
+  LINK_LIBS PUBLIC
+  MLIRGPU
+  MLIRIR
+  MLIRLLVMIR
+  MLIRROCDLIR
+  MLIRPass
+  MLIRSupport
+  MLIRTargetROCDLIR
+)
diff --git a/mlir/lib/Conversion/GPUToROCm/ConvertKernelFuncToHsaco.cpp b/mlir/lib/Conversion/GPUToROCm/ConvertKernelFuncToHsaco.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToROCm/ConvertKernelFuncToHsaco.cpp
@@ -0,0 +1,162 @@
+//===- ConvertKernelFuncToHsaco.cpp - MLIR GPU lowering passes ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to convert gpu kernel functions into a
+// corresponding binary blob that can be executed on a ROCm GPU. Currently
+// only translates the function itself but no dependencies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/GPUToROCm/GPUToROCmPass.h"
+
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Function.h"
+#include "mlir/IR/Module.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Target/ROCDLIR.h"
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace mlir;
+
+namespace {
+static constexpr const char *kHsacoAnnotation = "rocdl.hsaco";
+
+/// A pass converting GPU kernel modules to hsaco blobs.
+///
+/// Each gpu.module is translated to LLVM IR for ROCDL and lowered to AMDGPU
+/// assembly. A user provided HsacoGenerator turns the assembly into a blob,
+/// which is attached to the gpu.module as the 'rocdl.hsaco' attribute.
+class GpuKernelToHsacoPass
+    : public PassWrapper<GpuKernelToHsacoPass,
+                         OperationPass<gpu::GPUModuleOp>> {
+public:
+  GpuKernelToHsacoPass(HsacoGenerator hsacoGenerator)
+      : hsacoGenerator(std::move(hsacoGenerator)) {}
+
+  void runOnOperation() override {
+    gpu::GPUModuleOp module = getOperation();
+
+    // Lock access to the llvm context.
+    llvm::sys::SmartScopedLock<true> scopedLock(
+        module.getContext()
+            ->getRegisteredDialect<LLVM::LLVMDialect>()
+            ->getLLVMContextMutex());
+
+    // Make sure the AMDGPU target is initialized.
+    LLVMInitializeAMDGPUTarget();
+    LLVMInitializeAMDGPUTargetInfo();
+    LLVMInitializeAMDGPUTargetMC();
+    LLVMInitializeAMDGPUAsmPrinter();
+
+    auto llvmModule = translateModuleToROCDLIR(module);
+    if (!llvmModule)
+      return signalPassFailure();
+
+    // Translate the module to HSA code object and attach the result as
+    // attribute to the module.
+    if (auto hsacoAttr = translateGPUModuleToHsacoAnnotation(
+            *llvmModule, module.getLoc(), module.getName()))
+      module.setAttr(kHsacoAnnotation, hsacoAttr);
+    else
+      signalPassFailure();
+  }
+
+private:
+  /// Emits target assembly for `module` using the given target machine.
+  std::string translateModuleToLLVM(llvm::Module &module,
+                                    llvm::TargetMachine &targetMachine);
+
+  /// Converts llvmModule to hsaco using the user-provided generator. Location
+  /// is used for error reporting and name is forwarded to the HSACO generator
+  /// to use in its logging mechanisms.
+  OwnedHsaco convertModuleToHsaco(llvm::Module &llvmModule, Location loc,
+                                  StringRef name);
+
+  /// Translates llvmModule to hsaco and returns the result as attribute.
+  StringAttr translateGPUModuleToHsacoAnnotation(llvm::Module &llvmModule,
+                                                 Location loc, StringRef name);
+
+  HsacoGenerator hsacoGenerator;
+};
+
+} // anonymous namespace
+
+std::string GpuKernelToHsacoPass::translateModuleToLLVM(
+    llvm::Module &module, llvm::TargetMachine &targetMachine) {
+  std::string assembly;
+  {
+    // Codegen runs on a copy living in its own LLVMContext so that several
+    // threads can compile concurrently.
+    llvm::LLVMContext privateContext;
+    auto copy = LLVM::cloneModuleIntoNewContext(&privateContext, &module);
+
+    llvm::raw_string_ostream textStream(assembly);
+    llvm::buffer_ostream bufferedStream(textStream);
+    llvm::legacy::PassManager passManager;
+    targetMachine.addPassesToEmitFile(passManager, bufferedStream, nullptr,
+                                      llvm::CGFT_AssemblyFile);
+    passManager.run(*copy);
+  }
+
+  return assembly;
+}
+
+OwnedHsaco GpuKernelToHsacoPass::convertModuleToHsaco(llvm::Module &llvmModule,
+                                                      Location loc,
+                                                      StringRef name) {
+  // Look up the backend registered for the ROCm triple; report why on failure.
+  std::string error;
+  llvm::Triple triple("amdgcn-amd-amdhsa");
+  const llvm::Target *target =
+      llvm::TargetRegistry::lookupTarget("", triple, error);
+  if (!target) {
+    emitError(loc, "cannot initialize target triple: " + llvm::Twine(error));
+    return {};
+  }
+  // TODO(whchung): be able to set mcpu.
+  std::unique_ptr<llvm::TargetMachine> targetMachine(
+      target->createTargetMachine(triple.str(), "gfx900", "", {}, {}));
+  if (!targetMachine) {
+    emitError(loc, "cannot create target machine for gfx900");
+    return {};
+  }
+
+  // Set the data layout of the llvm module to match what the AMDGPU target needs.
+  llvmModule.setDataLayout(targetMachine->createDataLayout());
+
+  std::string isa = translateModuleToLLVM(llvmModule, *targetMachine);
+  return hsacoGenerator(isa, loc, name);
+}
+
+StringAttr GpuKernelToHsacoPass::translateGPUModuleToHsacoAnnotation(
+    llvm::Module &llvmModule, Location loc, StringRef name) {
+  // A null blob is propagated to the caller as a null attribute.
+  OwnedHsaco blob = convertModuleToHsaco(llvmModule, loc, name);
+  return blob ? StringAttr::get({blob->data(), blob->size()}, loc->getContext())
+              : StringAttr();
+}
+
+std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
+mlir::createConvertGPUKernelToHsacoPass(HsacoGenerator hsacoGenerator) {
+  return std::make_unique<GpuKernelToHsacoPass>(std::move(hsacoGenerator));
+}
diff --git a/mlir/test/Conversion/GPUToROCm/lit.local.cfg b/mlir/test/Conversion/GPUToROCm/lit.local.cfg
new file mode 100644
--- /dev/null
+++ b/mlir/test/Conversion/GPUToROCm/lit.local.cfg
@@ -0,0 +1,2 @@
+if not config.run_rocm_tests:
+  config.unsupported = True
diff --git a/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir
@@ -0,0 +1,26 @@
+// RUN: mlir-opt %s --test-kernel-to-hsaco -split-input-file | FileCheck %s
+
+// CHECK: attributes {rocdl.hsaco = "HSACO"}
+gpu.module @foo {
+  llvm.func @kernel(%arg0 : !llvm.float, %arg1 : !llvm<"float*">)
+    // CHECK: attributes  {gpu.kernel}
+    attributes  { gpu.kernel } {
+    llvm.return
+  }
+}
+
+// -----
+
+gpu.module @bar {
+  // CHECK: func @kernel_a
+  llvm.func @kernel_a()
+    attributes  { gpu.kernel } {
+    llvm.return
+  }
+
+  // CHECK: func @kernel_b
+  llvm.func @kernel_b()
+    attributes  { gpu.kernel } {
+    llvm.return
+  }
+}
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -5,6 +5,7 @@
   TestCallGraph.cpp
   TestConstantFold.cpp
   TestConvertGPUKernelToCubin.cpp
+  TestConvertGPUKernelToHsaco.cpp
   TestDominance.cpp
   TestLoopFusion.cpp
   TestGpuMemoryPromotion.cpp
@@ -37,6 +38,7 @@
   MLIREDSC
   MLIRGPU
   MLIRGPUtoCUDATransforms
+  MLIRGPUtoROCmTransforms
   MLIRLinalgOps
   MLIRLinalgTransforms
   MLIRSCF
diff --git a/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp b/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp
@@ -0,0 +1,31 @@
+//===- TestConvertGPUKernelToHsaco.cpp - Test gpu kernel hsaco lowering ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/GPUToROCm/GPUToROCmPass.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+using namespace mlir;
+
+#if MLIR_ROCM_CONVERSIONS_ENABLED
+static OwnedHsaco compileROCDLToHsacoForTesting(const std::string &, Location,
+                                                StringRef) {
+  const std::string payload = "HSACO"; // canned stand-in; inputs are ignored
+  return std::make_unique<std::vector<char>>(payload.begin(), payload.end());
+}
+
+namespace mlir {
+/// Registers the "test-kernel-to-hsaco" pipeline with the canned generator.
+void registerTestConvertGPUKernelToHsacoPass() {
+  auto build = [](OpPassManager &p) {
+    p.addPass(createConvertGPUKernelToHsacoPass(compileROCDLToHsacoForTesting));
+  };
+  const char *description = "Convert all kernel functions to ROCm HSACO blobs";
+  PassPipelineRegistration<>("test-kernel-to-hsaco", description, build);
+}
+} // namespace mlir
+#endif
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -38,6 +38,7 @@
 config.run_cuda_tests = @MLIR_CUDA_CONVERSIONS_ENABLED@
 config.cuda_wrapper_library_dir = "@MLIR_CUDA_WRAPPER_LIBRARY_DIR@"
 config.enable_cuda_runner = @MLIR_CUDA_RUNNER_ENABLED@
+config.run_rocm_tests = @MLIR_ROCM_CONVERSIONS_ENABLED@
 config.vulkan_wrapper_library_dir = "@MLIR_VULKAN_WRAPPER_LIBRARY_DIR@"
 config.enable_vulkan_runner = @MLIR_VULKAN_RUNNER_ENABLED@
 
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -46,6 +46,7 @@
 void registerTestCallGraphPass();
 void registerTestConstantFold();
 void registerTestConvertGPUKernelToCubinPass();
+void registerTestConvertGPUKernelToHsacoPass();
 void registerTestDominancePass();
 void registerTestFunc();
 void registerTestGpuMemoryPromotionPass();
@@ -111,6 +112,9 @@
   registerTestConstantFold();
 #if MLIR_CUDA_CONVERSIONS_ENABLED
   registerTestConvertGPUKernelToCubinPass();
+#endif
+#if MLIR_ROCM_CONVERSIONS_ENABLED
+  registerTestConvertGPUKernelToHsacoPass();
 #endif
   registerTestBufferPlacementPreparationPass();
   registerTestDominancePass();