diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -4,6 +4,7 @@
 set(LLVM_OPTIONAL_SOURCES
   AsyncRuntime.cpp
   CRunnerUtils.cpp
+  CudaRuntimeWrappers.cpp
   SparseUtils.cpp
   ExecutionEngine.cpp
   RunnerUtils.cpp
@@ -102,3 +103,41 @@
 set_property(TARGET mlir_async_runtime PROPERTY CXX_VISIBILITY_PRESET hidden)
 target_compile_definitions(mlir_async_runtime PRIVATE mlir_async_runtime_EXPORTS)
 
+if(MLIR_CUDA_RUNNER_ENABLED)
+  # Configure CUDA support. Using check_language first allows us to give a
+  # custom error message.
+  include(CheckLanguage)
+  check_language(CUDA)
+  if (CMAKE_CUDA_COMPILER)
+    enable_language(CUDA)
+  else()
+    message(SEND_ERROR
+      "Building the mlir cuda runner requires a working CUDA install")
+  endif()
+
+  # We need the libcuda.so library (the CUDA driver API library, despite the
+  # variable name). Report a missing library here rather than letting the
+  # link step below fail on a NOTFOUND value.
+  find_library(CUDA_RUNTIME_LIBRARY cuda)
+  if(NOT CUDA_RUNTIME_LIBRARY)
+    message(SEND_ERROR
+      "Building the mlir cuda runtime wrappers requires libcuda")
+  endif()
+
+  # Shared library of CUDA API wrappers; it is not folded into libMLIR.
+  add_mlir_library(mlir_cuda_runtime
+    SHARED
+    CudaRuntimeWrappers.cpp
+
+    EXCLUDE_FROM_LIBMLIR
+  )
+  set_property(TARGET mlir_cuda_runtime PROPERTY CXX_STANDARD 14)
+  target_include_directories(mlir_cuda_runtime
+    PRIVATE
+    ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+  )
+  target_link_libraries(mlir_cuda_runtime
+    PRIVATE
+    ${CUDA_RUNTIME_LIBRARY}
+  )
+endif()
diff --git a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
rename from mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
rename to mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -1,4 +1,4 @@
-//===- cuda-runtime-wrappers.cpp - MLIR CUDA runner wrapper library -------===//
+//===- CudaRuntimeWrappers.cpp - MLIR CUDA API wrapper library ------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -21,8 +21,7 @@
 set(MLIR_RUNNER_UTILS_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
 
 # Passed to lit.site.cfg.py.in to set up the path where to find the libraries
-# for the mlir cuda / rocm / spirv / vulkan runner tests.
-set(MLIR_CUDA_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
+# for the mlir rocm / spirv / vulkan runner tests.
 set(MLIR_ROCM_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
 set(MLIR_SPIRV_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
 set(MLIR_VULKAN_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
@@ -70,6 +69,10 @@
   mlir_async_runtime
   )
 
+if(MLIR_CUDA_RUNNER_ENABLED)
+  list(APPEND MLIR_TEST_DEPENDS mlir_cuda_runtime)
+endif()
+
 list(APPEND MLIR_TEST_DEPENDS MLIRUnitTests)
 
 if(LLVM_BUILD_EXAMPLES)
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
rename from mlir/test/mlir-cuda-runner/all-reduce-and.mlir
rename to mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-and.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-cuda-runner %s \
 // RUN:   -gpu-to-cubin="gpu-binary-annotation=nvvm.cubin" \
 // RUN:   -gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-max.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
rename from mlir/test/mlir-cuda-runner/all-reduce-max.mlir
rename to mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-max.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-cuda-runner %s \
 // RUN:   -gpu-to-cubin="gpu-binary-annotation=nvvm.cubin" \
 // RUN:   -gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-min.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
rename from mlir/test/mlir-cuda-runner/all-reduce-min.mlir
rename to mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-min.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-cuda-runner %s \
 // RUN:   -gpu-to-cubin="gpu-binary-annotation=nvvm.cubin" \
 // RUN:   -gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
rename from mlir/test/mlir-cuda-runner/all-reduce-op.mlir
rename to mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-cuda-runner %s \
 // RUN:   -gpu-to-cubin="gpu-binary-annotation=nvvm.cubin" \
 // RUN:   -gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
rename from mlir/test/mlir-cuda-runner/all-reduce-or.mlir
rename to mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-or.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-cuda-runner %s \
 // RUN:   -gpu-to-cubin="gpu-binary-annotation=nvvm.cubin" \
 // RUN:   -gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
rename from mlir/test/mlir-cuda-runner/all-reduce-region.mlir
rename to mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-cuda-runner %s \
 // RUN:   -gpu-to-cubin="gpu-binary-annotation=nvvm.cubin" \
 // RUN:   -gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
rename from mlir/test/mlir-cuda-runner/all-reduce-xor.mlir
rename to mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
--- a/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-cuda-runner %s \
 // RUN:   -gpu-to-cubin="gpu-binary-annotation=nvvm.cubin" \
 // RUN:   -gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
diff --git a/mlir/test/mlir-cuda-runner/async.mlir b/mlir/test/Integration/GPU/CUDA/async.mlir
rename from mlir/test/mlir-cuda-runner/async.mlir
rename to mlir/test/Integration/GPU/CUDA/async.mlir
--- a/mlir/test/mlir-cuda-runner/async.mlir
+++ b/mlir/test/Integration/GPU/CUDA/async.mlir
@@ -3,7 +3,7 @@
 // RUN:   -gpu-async-region -async-ref-counting \
 // RUN:   -gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" \
 // RUN:   -async-to-async-runtime -convert-async-to-llvm -convert-std-to-llvm \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_async_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void -O0 \
diff --git a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
rename from mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
rename to mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
--- a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
+++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-cuda-runner %s \
 // RUN:   -gpu-to-cubin="gpu-binary-annotation=nvvm.cubin" \
 // RUN:   -gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
diff --git a/mlir/test/mlir-cuda-runner/lit.local.cfg b/mlir/test/Integration/GPU/CUDA/lit.local.cfg
rename from mlir/test/mlir-cuda-runner/lit.local.cfg
rename to mlir/test/Integration/GPU/CUDA/lit.local.cfg
diff --git a/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
rename from mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir
rename to mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
--- a/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir
+++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-cuda-runner %s \
 // RUN:   -gpu-to-cubin="gpu-binary-annotation=nvvm.cubin" \
 // RUN:   -gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
diff --git a/mlir/test/mlir-cuda-runner/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
rename from mlir/test/mlir-cuda-runner/shuffle.mlir
rename to mlir/test/Integration/GPU/CUDA/shuffle.mlir
--- a/mlir/test/mlir-cuda-runner/shuffle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-cuda-runner %s \
 // RUN:   -gpu-to-cubin="gpu-binary-annotation=nvvm.cubin" \
 // RUN:   -gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
diff --git a/mlir/test/mlir-cuda-runner/two-modules.mlir b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
rename from mlir/test/mlir-cuda-runner/two-modules.mlir
rename to mlir/test/Integration/GPU/CUDA/two-modules.mlir
--- a/mlir/test/mlir-cuda-runner/two-modules.mlir
+++ b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-cuda-runner %s \
 // RUN:   -gpu-to-cubin="gpu-binary-annotation=nvvm.cubin" \
 // RUN:   -gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" \
-// RUN:   --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -75,7 +75,6 @@
     ToolSubst('toy-ch3', unresolved='ignore'),
     ToolSubst('toy-ch4', unresolved='ignore'),
     ToolSubst('toy-ch5', unresolved='ignore'),
-    ToolSubst('%cuda_wrapper_library_dir', config.cuda_wrapper_library_dir, unresolved='ignore'),
     ToolSubst('%linalg_test_lib_dir', config.linalg_test_lib_dir, unresolved='ignore'),
     ToolSubst('%mlir_runner_utils_dir', config.mlir_runner_utils_dir, unresolved='ignore'),
     ToolSubst('%rocm_wrapper_library_dir', config.rocm_wrapper_library_dir, unresolved='ignore'),
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -36,7 +36,6 @@
 config.linalg_test_lib_dir = "@MLIR_DIALECT_LINALG_INTEGRATION_TEST_LIB_DIR@"
 config.build_examples = @LLVM_BUILD_EXAMPLES@
 config.run_cuda_tests = @MLIR_CUDA_CONVERSIONS_ENABLED@
-config.cuda_wrapper_library_dir = "@MLIR_CUDA_WRAPPER_LIBRARY_DIR@"
 config.enable_cuda_runner = @MLIR_CUDA_RUNNER_ENABLED@
 config.run_rocm_tests = @MLIR_ROCM_CONVERSIONS_ENABLED@
 config.rocm_wrapper_library_dir = "@MLIR_ROCM_WRAPPER_LIBRARY_DIR@"
diff --git a/mlir/tools/mlir-cuda-runner/CMakeLists.txt b/mlir/tools/mlir-cuda-runner/CMakeLists.txt
--- a/mlir/tools/mlir-cuda-runner/CMakeLists.txt
+++ b/mlir/tools/mlir-cuda-runner/CMakeLists.txt
@@ -1,5 +1,4 @@
 set(LLVM_OPTIONAL_SOURCES
-  cuda-runtime-wrappers.cpp
   mlir-cuda-runner.cpp
   )
 set(LLVM_LINK_COMPONENTS
@@ -27,21 +26,6 @@
   # We need the libcuda.so library.
   find_library(CUDA_RUNTIME_LIBRARY cuda)
 
-  add_mlir_library(cuda-runtime-wrappers
-    SHARED
-    cuda-runtime-wrappers.cpp
-
-    EXCLUDE_FROM_LIBMLIR
-  )
-  target_include_directories(cuda-runtime-wrappers
-    PRIVATE
-    ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
-  )
-  target_link_libraries(cuda-runtime-wrappers
-    PRIVATE
-    ${CUDA_RUNTIME_LIBRARY}
-  )
-
   get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
   set(LIBS
     ${conversion_libs}
@@ -79,7 +63,7 @@
     mlir-cuda-runner.cpp
 
     DEPENDS
-    cuda-runtime-wrappers
+    mlir_cuda_runtime
     )
   target_include_directories(mlir-cuda-runner
     PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}