diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -91,7 +91,8 @@
     message(SEND_ERROR
       "Building mlir with cuda support requires the NVPTX backend")
   endif()
-
+
+  find_package(CUDAToolkit)
   # Configure CUDA language support. Using check_language first allows us to
   # give a custom error message.
   include(CheckLanguage)
@@ -115,12 +116,10 @@
     ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
   )
 
-  find_library(CUDA_DRIVER_LIBRARY cuda)
-
   target_link_libraries(MLIRGPUTransforms
     PRIVATE
     MLIRNVVMToLLVMIRTranslation
-    ${CUDA_DRIVER_LIBRARY}
+    CUDA::nvptxcompiler_static
   )
 endif()
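Note: the CUDA::nvptxcompiler_static imported target is provided by find_package(CUDAToolkit) in newer CMake releases; the static nvPTXCompiler library ships with the CUDA toolkit itself, so this build step no longer depends on libcuda.so from a driver install. The following is a minimal, hypothetical smoke test for the new dependency; it is not part of the patch, and the program itself is illustrative:

#include <cstdio>

#include <nvPTXCompiler.h>

int main() {
  unsigned int major = 0, minor = 0;
  // Succeeds only if nvPTXCompiler.h was found and nvptxcompiler_static
  // was linked in correctly.
  if (nvPTXCompilerGetVersion(&major, &minor) != NVPTXCOMPILE_SUCCESS) {
    std::fprintf(stderr, "nvPTXCompiler unavailable\n");
    return 1;
  }
  std::printf("nvPTXCompiler API version %u.%u\n", major, minor);
  return 0;
}

If this links and runs, the MLIRGPUTransforms change above should resolve against the same library.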
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
@@ -20,24 +20,56 @@
 #include "llvm/Support/TargetSelect.h"
 
 #include <cuda.h>
+#include <nvPTXCompiler.h>
+#include <string>
 
 using namespace mlir;
 
-static void emitCudaError(const llvm::Twine &expr, const char *buffer,
-                          CUresult result, Location loc) {
-  const char *error;
-  cuGetErrorString(result, &error);
+static void emitNvptxError(const llvm::Twine &expr,
+                           nvPTXCompilerHandle compiler,
+                           nvPTXCompileResult result, Location loc) {
+  auto getErrMsg = [](nvPTXCompileResult result) -> const char * {
+    switch (result) {
+    case NVPTXCOMPILE_SUCCESS:
+      return "Success";
+    case NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE:
+      return "Invalid compiler handle";
+    case NVPTXCOMPILE_ERROR_INVALID_INPUT:
+      return "Invalid input";
+    case NVPTXCOMPILE_ERROR_COMPILATION_FAILURE:
+      return "Compilation failure";
+    case NVPTXCOMPILE_ERROR_INTERNAL:
+      return "Internal error";
+    case NVPTXCOMPILE_ERROR_OUT_OF_MEMORY:
+      return "Out of memory";
+    case NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE:
+      return "Invocation incomplete";
+    case NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION:
+      return "Unsupported PTX version";
+    }
+    return "Unknown error";
+  };
+  size_t errorSize;
+  auto status = nvPTXCompilerGetErrorLogSize(compiler, &errorSize);
+  std::string errorLog;
+  if (status == NVPTXCOMPILE_SUCCESS) {
+    errorLog.resize(errorSize);
+    status = nvPTXCompilerGetErrorLog(compiler, errorLog.data());
+    if (status != NVPTXCOMPILE_SUCCESS)
+      errorLog.clear();
+  }
   emitError(loc, expr.concat(" failed with error code ")
-                     .concat(llvm::Twine{error})
+                     .concat(llvm::Twine{getErrMsg(result)})
                      .concat("[")
-                     .concat(buffer)
+                     .concat(errorLog)
                      .concat("]"));
 }
 
-#define RETURN_ON_CUDA_ERROR(expr)                                            \
-  do {                                                                        \
-    if (auto status = (expr)) {                                               \
-      emitCudaError(#expr, jitErrorBuffer, status, loc);                      \
-      return {};                                                              \
-    }                                                                         \
-  } while (false)
+#define RETURN_ON_NVPTX_ERROR(expr)                                           \
+  do {                                                                        \
+    nvPTXCompileResult result = (expr);                                       \
+    if (result != NVPTXCOMPILE_SUCCESS) {                                     \
+      emitNvptxError(#expr, compiler, result, loc);                           \
+      return {};                                                              \
+    }                                                                         \
+  } while (false)
@@ -88,46 +120,20 @@
 SerializeToCubinPass::serializeISA(const std::string &isa) {
   Location loc = getOperation().getLoc();
-  char jitErrorBuffer[4096] = {0};
+  nvPTXCompilerHandle compiler = nullptr;
+  RETURN_ON_NVPTX_ERROR(
+      nvPTXCompilerCreate(&compiler, isa.length(), isa.c_str()));
 
-  RETURN_ON_CUDA_ERROR(cuInit(0));
-
-  // Linking requires a device context.
-  CUdevice device;
-  RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0));
-  CUcontext context;
-  RETURN_ON_CUDA_ERROR(cuCtxCreate(&context, 0, device));
-  CUlinkState linkState;
-
-  CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
-                               CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
-  void *jitOptionsVals[] = {jitErrorBuffer,
-                            reinterpret_cast<void *>(sizeof(jitErrorBuffer))};
-
-  RETURN_ON_CUDA_ERROR(cuLinkCreate(2,              /* number of jit options */
-                                    jitOptions,     /* jit options */
-                                    jitOptionsVals, /* jit option values */
-                                    &linkState));
-
-  auto kernelName = getOperation().getName().str();
-  RETURN_ON_CUDA_ERROR(cuLinkAddData(
-      linkState, CUjitInputType::CU_JIT_INPUT_PTX,
-      const_cast<void *>(static_cast<const void *>(isa.c_str())), isa.length(),
-      kernelName.c_str(), 0, /* number of jit options */
-      nullptr,               /* jit options */
-      nullptr                /* jit option values */
-      ));
-
-  void *cubinData;
-  size_t cubinSize;
-  RETURN_ON_CUDA_ERROR(cuLinkComplete(linkState, &cubinData, &cubinSize));
+  // Compile with no extra options; ptxas defaults choose the target.
+  RETURN_ON_NVPTX_ERROR(nvPTXCompilerCompile(compiler, 0, nullptr));
 
-  char *cubinAsChar = static_cast<char *>(cubinData);
-  auto result =
-      std::make_unique<std::vector<char>>(cubinAsChar, cubinAsChar + cubinSize);
+  size_t cubinSize;
+  RETURN_ON_NVPTX_ERROR(
+      nvPTXCompilerGetCompiledProgramSize(compiler, &cubinSize));
 
-  // This will also destroy the cubin data.
-  RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState));
-  RETURN_ON_CUDA_ERROR(cuCtxDestroy(context));
+  auto result = std::make_unique<std::vector<char>>(cubinSize);
+  RETURN_ON_NVPTX_ERROR(
+      nvPTXCompilerGetCompiledProgram(compiler, result->data()));
+  nvPTXCompilerDestroy(&compiler);
 
   return result;
 }
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -169,6 +169,7 @@
 target_compile_definitions(mlir_async_runtime PRIVATE mlir_async_runtime_EXPORTS)
 
 if(MLIR_ENABLE_CUDA_RUNNER)
+  find_package(CUDAToolkit)
   # Configure CUDA support. Using check_language first allows us to give a
   # custom error message.
   include(CheckLanguage)
@@ -180,9 +181,6 @@
       "Building the mlir cuda runner requires a working CUDA install")
   endif()
 
-  # We need the libcuda.so library.
-  find_library(CUDA_RUNTIME_LIBRARY cuda)
-
   add_mlir_library(mlir_cuda_runtime
     SHARED
     CudaRuntimeWrappers.cpp
@@ -196,7 +194,7 @@
   )
   target_link_libraries(mlir_cuda_runtime
     PRIVATE
-    ${CUDA_RUNTIME_LIBRARY}
+    CUDA::cuda_driver
   )
 endif()
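For reference, a self-contained sketch of the nvPTXCompiler sequence the pass now follows (create, compile, fetch log or binary, destroy). The compilePtx wrapper and the --gpu-name=sm_70 option are illustrative only; the in-tree change passes zero options, so ptxas defaults pick the target architecture. Unlike the cuLink path it replaces, this flow needs no CUDA driver, device, or context:

#include <cstdio>
#include <string>
#include <vector>

#include <nvPTXCompiler.h>

// Compile a PTX string to a cubin, mirroring the sequence used in
// SerializeToCubinPass::serializeISA. Returns an empty vector on failure.
static std::vector<char> compilePtx(const std::string &ptx) {
  nvPTXCompilerHandle compiler = nullptr;
  if (nvPTXCompilerCreate(&compiler, ptx.length(), ptx.c_str()) !=
      NVPTXCOMPILE_SUCCESS)
    return {};

  // Unlike the patch above, pass an explicit target; sm_70 is a placeholder.
  const char *options[] = {"--gpu-name=sm_70"};
  if (nvPTXCompilerCompile(compiler, 1, options) != NVPTXCOMPILE_SUCCESS) {
    // On failure, the error log carries the ptxas diagnostics.
    size_t logSize = 0;
    if (nvPTXCompilerGetErrorLogSize(compiler, &logSize) ==
            NVPTXCOMPILE_SUCCESS &&
        logSize != 0) {
      std::string log(logSize, '\0');
      nvPTXCompilerGetErrorLog(compiler, log.data());
      std::fprintf(stderr, "PTX compilation failed: %s\n", log.c_str());
    }
    nvPTXCompilerDestroy(&compiler);
    return {};
  }

  // Copy the compiled binary out before destroying the handle.
  size_t cubinSize = 0;
  nvPTXCompilerGetCompiledProgramSize(compiler, &cubinSize);
  std::vector<char> cubin(cubinSize);
  nvPTXCompilerGetCompiledProgram(compiler, cubin.data());
  nvPTXCompilerDestroy(&compiler);
  return cubin;
}

The design point of the patch is visible here: because nvptxcompiler_static performs the PTX-to-cubin step entirely in-process, MLIR can be built (and gpu-to-cubin exercised) on machines without an NVIDIA driver installed.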