diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -70,6 +70,32 @@
 }
 namespace gpu {
+
+/// Options for serializing to CUBIN.
+struct SerializationToCubinOptions {
+  /// LLVM target triple.
+  std::string triple;
+
+  /// SM architecture of the target GPU.
+  std::string chip;
+
+  /// PTX version to produce (e.g. "+ptx60").
+  std::string features;
+
+  /// Optimization level.
+  int optLevel = 2;
+
+  /// Dump generated PTX to stderr for debugging purposes.
+  bool dumpPtx = false;
+
+  /// Compiles the generated PTX with the ptxas compiler. When false, the
+  /// generated PTX is JIT-compiled by the driver.
+  bool usePtxas = true;
+
+  /// Parameters to pass to the ptxas compiler. Ignored when JIT compiling
+  /// with the driver.
+  std::string ptxasParams;
+};
+
 /// Base pass class to serialize kernel functions through LLVM into
 /// user-specified IR and add the resulting blob as module attribute.
 class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
@@ -117,9 +143,18 @@
       *this, "gpu-binary-annotation",
       llvm::cl::desc("Annotation attribute string for GPU binary"),
       llvm::cl::init(getDefaultGpuBinaryAnnotation())};
+  Option<bool> dumpPtx{*this, "dump-ptx",
+                       ::llvm::cl::desc("Dump generated PTX"),
+                       llvm::cl::init(false)};
+
+  Option<bool> usePtxas{
+      *this, "use-ptxas",
+      ::llvm::cl::desc("Compile generated PTX with the ptxas compiler"),
+      llvm::cl::init(true)};
+  Option<std::string> ptxasParams{
+      *this, "ptxas-params",
+      ::llvm::cl::desc("Parameters to pass to the ptxas compiler")};
 };
 
 } // namespace gpu
@@ -137,11 +172,8 @@
 
 /// Create an instance of the GPU kernel function to CUBIN binary serialization
 /// pass with optLevel (default level 2).
-std::unique_ptr<Pass> createGpuSerializeToCubinPass(StringRef triple,
-                                                    StringRef chip,
-                                                    StringRef features,
-                                                    int optLevel = 2,
-                                                    bool dumpPtx = false);
+std::unique_ptr<Pass>
+createGpuSerializeToCubinPass(const gpu::SerializationToCubinOptions &options);
 
 /// Create an instance of the GPU kernel function to HSAco binary serialization
 /// pass.
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
@@ -12,7 +12,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
 
 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
 #include "mlir/Pass/Pass.h"
@@ -36,6 +43,106 @@
           .concat("]"));
 }
 
+static constexpr char kPtxasCompilerName[] = "ptxas";
+
+/// Compiles the given PTX source with ptxas and returns the resulting cubin,
+/// or failure with an explanation in `message`.
+static FailureOr<std::string>
+compileWithPtxas(StringRef smCapability, StringRef ptxasParams,
+                 StringRef ptxSource, bool dumpPtx, std::string *message) {
+  // Step 0. Find the ptxas compiler in PATH.
+  std::optional<std::string> ptxasCompiler =
+      llvm::sys::Process::FindInEnvPath("PATH", kPtxasCompilerName);
+  if (!ptxasCompiler.has_value())
+    return failure();
+
+  // Step 1. Create temporary files: the PTX source file, log files, and the
+  // cubin file.
+  llvm::SmallString<64> ptxSourceFile, stdinFile, stdoutFile, stderrFile;
+  llvm::sys::fs::createTemporaryFile("mlir-ptx", "", ptxSourceFile);
+  llvm::sys::fs::createTemporaryFile("ptxas-stdin", "", stdinFile);
+  llvm::sys::fs::createTemporaryFile("ptxas-stdout", "", stdoutFile);
+  llvm::sys::fs::createTemporaryFile("ptxas-stderr", "", stderrFile);
+  std::string cubinFile = std::string(ptxSourceFile) + ".cubin";
+  llvm::FileRemover stdinRemover(stdinFile.c_str());
+  llvm::FileRemover stdoutRemover(stdoutFile.c_str());
+  llvm::FileRemover stderrRemover(stderrFile.c_str());
+  llvm::FileRemover binRemover(cubinFile.c_str());
+  llvm::FileRemover srcRemover(ptxSourceFile.c_str());
+
+  // Step 2. Write the generated PTX into a file, so it can be passed to the
+  // ptxas compiler.
+  std::error_code ec;
+  llvm::raw_fd_ostream fPtxSource(ptxSourceFile, ec);
+  fPtxSource << ptxSource;
+  fPtxSource.close();
+  if (fPtxSource.has_error()) {
+    *message = std::string(
+        "Could not write the generated PTX into a temporary file\n");
+    return failure();
+  }
+
+  // Step 3. Build the ptxas command line.
+  std::vector<StringRef> argVector{StringRef("ptxas"), StringRef("-arch"),
+                                   smCapability,       StringRef(ptxSourceFile),
+                                   StringRef("-o"),    StringRef(cubinFile)};
+#ifdef _WIN32
+  auto tokenize = llvm::cl::TokenizeWindowsCommandLine;
+#else
+  auto tokenize = llvm::cl::TokenizeGNUCommandLine;
+#endif // _WIN32
+  llvm::BumpPtrAllocator scratchAllocator;
+  llvm::StringSaver stringSaver(scratchAllocator);
+  SmallVector<const char *> rawArgs;
+  tokenize(ptxasParams, stringSaver, rawArgs, /*MarkEOLs=*/false);
+  for (const auto *rawArg : rawArgs)
+    argVector.emplace_back(rawArg);
+
+  std::optional<StringRef> redirects[] = {
+      stdinFile.str(),
+      stdoutFile.str(),
+      stderrFile.str(),
+  };
+
+  // Step 4. Invoke ptxas.
+  if (llvm::sys::ExecuteAndWait(ptxasCompiler.value(),
+                                llvm::ArrayRef(argVector),
+                                /*Env=*/std::nullopt,
+                                /*Redirects=*/redirects,
+                                /*SecondsToWait=*/0,
+                                /*MemoryLimit=*/0,
+                                /*ErrMsg=*/message)) {
+    if (message->empty()) {
+      llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeErrorlog =
+          llvm::MemoryBuffer::getFile(stderrFile);
+      *message = std::string("Invoking ptxas failed, error log: ");
+      if (maybeErrorlog)
+        *message += maybeErrorlog->get()->getBuffer().str();
+    }
+    stderrRemover.releaseFile();
+    return failure();
+  }
+
+  // Step 5. Dump the ptxas output (stderr) if the dumpPtx flag is set. With
+  // `-v` in ptxasParams this shows local memory usage, register usage, etc.
+  if (dumpPtx) {
+    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFlog =
+        llvm::MemoryBuffer::getFile(stderrFile);
+    if (maybeFlog)
+      llvm::WithColor::note() << maybeFlog->get()->getBuffer().str();
+  }
+
+  // Step 6. Read the cubin file and return it. It will eventually be embedded
+  // into the executable.
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFcubin =
+      llvm::MemoryBuffer::getFile(cubinFile);
+  if (!maybeFcubin) {
+    *message = std::string("Could not read the cubin file\n");
+    return failure();
+  }
+
+  return std::string(maybeFcubin->get()->getBuffer());
+}
+
 #define RETURN_ON_CUDA_ERROR(expr)                                            \
   do {                                                                        \
     if (auto status = (expr)) {                                               \
@@ -54,11 +161,13 @@
   SerializeToCubinPass(StringRef triple = "nvptx64-nvidia-cuda",
                        StringRef chip = "sm_35", StringRef features = "+ptx60",
-                       int optLevel = 2, bool dumpPtx = false);
+                       int optLevel = 2, bool dumpPtx = false,
+                       bool usePtxas = true, StringRef ptxasParams = {});
 
   StringRef getArgument() const override { return "gpu-to-cubin"; }
   StringRef getDescription() const override {
     return "Lower GPU kernel function to CUBIN binary annotations";
   }
 
 private:
@@ -80,9 +189,10 @@
 SerializeToCubinPass::SerializeToCubinPass(StringRef triple, StringRef chip,
                                            StringRef features, int optLevel,
-                                           bool dumpPtx) {
+                                           bool dumpPtx, bool usePtxas,
+                                           StringRef ptxasParams) {
   // No matter how this pass is constructed, ensure that the NVPTX backend
   // is initialized exactly once.
   llvm::call_once(initializeBackendOnce, []() {
     // Initialize LLVM NVPTX backend.
     LLVMInitializeNVPTXTarget();
@@ -94,7 +204,9 @@
   maybeSetOption(this->triple, triple);
   maybeSetOption(this->chip, chip);
   maybeSetOption(this->features, features);
+  maybeSetOption(this->ptxasParams, ptxasParams);
   this->dumpPtx = dumpPtx;
+  this->usePtxas = usePtxas;
   if (this->optLevel.getNumOccurrences() == 0)
     this->optLevel.setValue(optLevel);
 }
@@ -131,9 +244,24 @@
   auto kernelName = getOperation().getName().str();
   if (dumpPtx) {
-    llvm::dbgs() << " Kernel Name : [" << kernelName << "]\n";
-    llvm::dbgs() << isa << "\n";
+    llvm::errs() << "// Kernel Name : [" << kernelName << "]\n";
+    llvm::errs() << isa << "\n";
   }
+
+  if (usePtxas) {
+    // Try to compile the PTX with ptxas.
+    std::string message;
+    FailureOr<std::string> maybeCubinImage =
+        compileWithPtxas(this->chip, ptxasParams, isa, dumpPtx, &message);
+    if (succeeded(maybeCubinImage)) {
+      return std::make_unique<std::vector<char>>(
+          maybeCubinImage.value().begin(), maybeCubinImage.value().end());
+    }
+    emitError(loc) << message;
+    return {};
+  }
+
+  // Otherwise, JIT compile the PTX with the CUDA driver.
   RETURN_ON_CUDA_ERROR(cuLinkAddData(
       linkState, CUjitInputType::CU_JIT_INPUT_PTX,
       const_cast<void *>(static_cast<const void *>(isa.c_str())), isa.length(),
@@ -159,17 +287,22 @@
 // Register pass to serialize GPU kernel functions to a CUBIN binary
 // annotation.
 void mlir::registerGpuSerializeToCubinPass() {
-  PassRegistration<SerializeToCubinPass> registerSerializeToCubin(
-      [] { return std::make_unique<SerializeToCubinPass>(); });
+  PassRegistration<SerializeToCubinPass> registerSerializeToCubin([] {
+    // Initialize LLVM NVPTX backend.
+    LLVMInitializeNVPTXTarget();
+    LLVMInitializeNVPTXTargetInfo();
+    LLVMInitializeNVPTXTargetMC();
+    LLVMInitializeNVPTXAsmPrinter();
+
+    return std::make_unique<SerializeToCubinPass>();
+  });
 }
 
-std::unique_ptr<Pass> mlir::createGpuSerializeToCubinPass(StringRef triple,
-                                                          StringRef arch,
-                                                          StringRef features,
-                                                          int optLevel,
-                                                          bool dumpPtx) {
-  return std::make_unique<SerializeToCubinPass>(triple, arch, features,
-                                                optLevel, dumpPtx);
+std::unique_ptr<Pass> mlir::createGpuSerializeToCubinPass(
+    const gpu::SerializationToCubinOptions &options) {
+  return std::make_unique<SerializeToCubinPass>(
+      options.triple, options.chip, options.features, options.optLevel,
+      options.dumpPtx, options.usePtxas, options.ptxasParams);
 }
 
 #else // MLIR_GPU_TO_CUBIN_PASS_ENABLE
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -102,8 +102,12 @@
   // Finalize GPU code generation.
   if (gpuCodegen) {
 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
-    pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
-        options.gpuTriple, options.gpuChip, options.gpuFeatures));
+    gpu::SerializationToCubinOptions cubinOptions;
+    cubinOptions.triple = options.gpuTriple;
+    cubinOptions.chip = options.gpuChip;
+    cubinOptions.features = options.gpuFeatures;
+    pm.addNestedPass<gpu::GPUModuleOp>(
+        createGpuSerializeToCubinPass(cubinOptions));
 #endif
     pm.addPass(createGpuToLLVMConversionPass());
   }
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -172,8 +172,12 @@
   pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
 
 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
-  pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
-      options.cubinTriple, options.cubinChip, options.cubinFeatures));
+  gpu::SerializationToCubinOptions cubinOptions;
+  cubinOptions.triple = options.cubinTriple;
+  cubinOptions.chip = options.cubinChip;
+  cubinOptions.features = options.cubinFeatures;
+  pm.addNestedPass<gpu::GPUModuleOp>(
+      createGpuSerializeToCubinPass(cubinOptions));
 #endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
 }
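
Usage note: after this change, clients populate a gpu::SerializationToCubinOptions
struct rather than threading individual arguments through
createGpuSerializeToCubinPass. A minimal sketch, assuming an existing
OpPassManager `pm`; the triple, chip, and features values are illustrative:

  // Serialize GPU modules to CUBIN through ptxas. "-v" is forwarded verbatim
  // to ptxas so that, together with dumpPtx, its resource-usage report
  // (registers, local memory) is printed to stderr.
  gpu::SerializationToCubinOptions cubinOptions;
  cubinOptions.triple = "nvptx64-nvidia-cuda";
  cubinOptions.chip = "sm_80";
  cubinOptions.features = "+ptx76";
  cubinOptions.usePtxas = true;    // the default; false selects the driver JIT
  cubinOptions.ptxasParams = "-v";
  cubinOptions.dumpPtx = true;
  pm.addNestedPass<gpu::GPUModuleOp>(
      createGpuSerializeToCubinPass(cubinOptions));

With these options, compileWithPtxas builds a command line equivalent to
`ptxas -arch sm_80 <tmp> -o <tmp>.cubin -v`, where <tmp> is the temporary PTX
file. From mlir-opt, the flags added in this patch are exposed as pass options,
e.g. --gpu-to-cubin="use-ptxas=true ptxas-params=-v dump-ptx=true".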