diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -117,6 +117,9 @@
       *this, "gpu-binary-annotation",
       llvm::cl::desc("Annotation attribute string for GPU binary"),
       llvm::cl::init(getDefaultGpuBinaryAnnotation())};
+  Option<bool> dumpPtx{*this, "dump-ptx",
+                       ::llvm::cl::desc("Dump generated PTX"),
+                       llvm::cl::init(false)};
 };
 } // namespace gpu
 
@@ -137,7 +140,8 @@
 std::unique_ptr<Pass>
 createGpuSerializeToCubinPass(StringRef triple, StringRef chip,
                               StringRef features,
-                              int optLevel = 2);
+                              int optLevel = 2,
+                              bool dumpPtx = false);
 
 /// Create an instance of the GPU kernel function to HSAco binary serialization
 /// pass.
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "llvm/Support/Debug.h"
 
 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
 #include "mlir/Pass/Pass.h"
@@ -50,7 +51,7 @@
   SerializeToCubinPass(StringRef triple = "nvptx64-nvidia-cuda",
                        StringRef chip = "sm_35",
                        StringRef features = "+ptx60",
-                       int optLevel = 2);
+                       int optLevel = 2, bool dumpPtx = false);
 
   StringRef getArgument() const override { return "gpu-to-cubin"; }
   StringRef getDescription() const override {
@@ -73,10 +74,12 @@
 }
 
 SerializeToCubinPass::SerializeToCubinPass(StringRef triple, StringRef chip,
-                                           StringRef features, int optLevel) {
+                                           StringRef features, int optLevel,
+                                           bool dumpPtx) {
   maybeSetOption(this->triple, triple);
   maybeSetOption(this->chip, chip);
   maybeSetOption(this->features, features);
+  this->dumpPtx = dumpPtx;
   if (this->optLevel.getNumOccurrences() == 0)
     this->optLevel.setValue(optLevel);
 }
@@ -112,6 +115,10 @@
                                     &linkState));
 
   auto kernelName = getOperation().getName().str();
+  if (dumpPtx) {
+    llvm::dbgs() << " Kernel Name : [" << kernelName << "]\n";
+    llvm::dbgs() << isa << "\n";
+  }
   RETURN_ON_CUDA_ERROR(cuLinkAddData(
       linkState, CUjitInputType::CU_JIT_INPUT_PTX,
       const_cast<void *>(static_cast<const void *>(isa.c_str())), isa.length(),
@@ -151,9 +158,10 @@
 std::unique_ptr<Pass>
 mlir::createGpuSerializeToCubinPass(StringRef triple, StringRef arch,
                                     StringRef features,
-                                    int optLevel) {
+                                    int optLevel,
+                                    bool dumpPtx) {
   return std::make_unique<SerializeToCubinPass>(triple, arch, features,
-                                                optLevel);
+                                                optLevel, dumpPtx);
 }
 
 #else  // MLIR_GPU_TO_CUBIN_PASS_ENABLE
diff --git a/mlir/test/Conversion/GPUToCUDA/dump-ptx.mlir b/mlir/test/Conversion/GPUToCUDA/dump-ptx.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Conversion/GPUToCUDA/dump-ptx.mlir
@@ -0,0 +1,15 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{dump-ptx}))' \
+// RUN: 2>&1 | FileCheck %s
+
+// CHECK: Generated by LLVM NVPTX Back-End
+// CHECK: .visible .func kernel_a()
+// CHECK: ret;
+
+gpu.module @bar {
+  llvm.func @kernel_a()
+    attributes { gpu.kernel } {
+    llvm.return
+  }
+}