diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Passes.h
@@ -87,6 +87,12 @@
       *this, "gpu-binary-annotation",
       llvm::cl::desc("Annotation attribute string for GPU binary"),
       llvm::cl::init(getDefaultGpuBinaryAnnotation())};
+  Option<bool> dumpAsm{
+      *this, "dump-asm",
+      llvm::cl::desc("Whether the final generated instructions or intermediate "
+                     "IR (if stopping early) for a kernel should be dumped to "
+                     "the debug stream"),
+      llvm::cl::init(false)};
 };
 } // namespace gpu
 
@@ -102,6 +108,14 @@
 /// annotation.
 void registerGpuSerializeToHsacoPass();
 
+/// Create a pass that serializes GPU kernel functions to an HSAco binary. If
+/// `dumpAsm` is true, the generated instructions go to the debug stream too.
+std::unique_ptr<Pass> createGpuSerializeToHsacoPass(StringRef triple,
+                                                    StringRef arch,
+                                                    StringRef features,
+                                                    int optLevel,
+                                                    bool dumpAsm = false);
+
 /// Generate the code for registering passes.
 #define GEN_PASS_REGISTRATION
 #include "mlir/Dialect/GPU/Passes.h.inc"
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
@@ -21,8 +21,15 @@
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Target/TargetMachine.h"
 
+#include <mutex>
+#include <string>
+
 using namespace mlir;
 
+// Held while a module's assembly is dumped, so that multiple threads cannot
+// write the assembly of separate modules to the debug stream simultaneously.
+static std::mutex dumpAsmLock;
+
 std::string gpu::getDefaultGpuBinaryAnnotation() { return "gpu.binary"; }
 
 gpu::SerializeToBlobPass::SerializeToBlobPass(TypeID passID)
@@ -76,6 +83,12 @@
 
   std::string targetISA = std::move(maybeTargetISA.getValue());
 
+  if (dumpAsm.getValue()) {
+    const std::lock_guard<std::mutex> lock(dumpAsmLock);
+    llvm::dbgs() << targetISA << "\n";
+    llvm::dbgs().flush();
+  }
+
   // Serialize the target ISA.
   std::unique_ptr<std::vector<char>> blob = serializeISA(targetISA);
   if (!blob)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
@@ -61,7 +61,7 @@
     : public PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass> {
 public:
   SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features,
-                       int optLevel);
+                       int optLevel, bool dumpAsm = false);
   SerializeToHsacoPass(const SerializeToHsacoPass &other);
   StringRef getArgument() const override { return "gpu-to-hsaco"; }
   StringRef getDescription() const override {
@@ -127,12 +127,15 @@
 }
 
 SerializeToHsacoPass::SerializeToHsacoPass(StringRef triple, StringRef arch,
-                                           StringRef features, int optLevel) {
+                                           StringRef features, int optLevel,
+                                           bool dumpAsm) {
   maybeSetOption(this->triple, [&triple] { return triple.str(); });
   maybeSetOption(this->chip, [&arch] { return arch.str(); });
   maybeSetOption(this->features, [&features] { return features.str(); });
   if (this->optLevel.getNumOccurrences() == 0)
     this->optLevel.setValue(optLevel);
+  if (this->dumpAsm.getNumOccurrences() == 0 && dumpAsm)
+    this->dumpAsm.setValue(dumpAsm); // Keep an explicitly given value.
 }
 
 void SerializeToHsacoPass::getDependentDialects(
@@ -479,6 +482,18 @@
                                                       "", 2);
       });
 }
+
+/// Create a pass that serializes GPU kernel functions to an HSAco binary. If
+/// `dumpAsm` is true, the generated instructions go to the debug stream too.
+std::unique_ptr<Pass> mlir::createGpuSerializeToHsacoPass(StringRef triple,
+                                                          StringRef arch,
+                                                          StringRef features,
+                                                          int optLevel,
+                                                          bool dumpAsm) {
+  return std::make_unique<SerializeToHsacoPass>(triple, arch, features,
+                                                optLevel, dumpAsm);
+}
+
 #else  // MLIR_GPU_TO_HSACO_PASS_ENABLE
 void mlir::registerGpuSerializeToHsacoPass() {}
 #endif // MLIR_GPU_TO_HSACO_PASS_ENABLE
diff --git a/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir
--- a/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir
+++ b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir
@@ -1,6 +1,9 @@
 // RUN: mlir-opt %s --test-gpu-to-hsaco | FileCheck %s
+// RUN: mlir-opt %s --test-gpu-to-hsaco=dump-asm=true 2>&1 |\
+// RUN:   FileCheck %s --check-prefix=CHECK-ASM
 
 // CHECK: gpu.module @foo attributes {gpu.binary = "HSACO"}
+// CHECK-ASM: .globl kernel
 gpu.module @foo {
   llvm.func @kernel(%arg0 : f32, %arg1 : !llvm.ptr<f32>)
     // CHECK: attributes  {gpu.kernel}
@@ -23,3 +26,4 @@
     llvm.return
   }
 }
+// CHECK-ASM: amdhsa.target: