diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt --- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt +++ b/openmp/libomptarget/plugins/cuda/CMakeLists.txt @@ -15,12 +15,6 @@ elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND) libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.") return() -elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND) - libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.") - return() -elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND) - libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.") - return() endif() libomptarget_say("Building CUDA offloading plugin.") @@ -28,7 +22,6 @@ # Define the suffix for the runtime messaging dumps. add_definitions(-DTARGET_NAME=CUDA) -include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}) add_library(omptarget.rtl.cuda SHARED src/rtl.cpp) @@ -39,7 +32,7 @@ target_link_libraries(omptarget.rtl.cuda elf_common MemoryManager - ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES} + ${CMAKE_DL_LIBS} ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" "-Wl,-z,defs") diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -12,7 +12,8 @@ #include #include -#include +#include +#include #include #include #include @@ -27,6 +28,161 @@ #include "MemoryManager.h" +/// To avoid a cuda.h and link-time libcuda.so dependence we declare used values +/// ourselves and load the runtime functions we need at runtime. 
+/// +///{ + +namespace { + +typedef int CUdevice; +typedef uintptr_t CUdeviceptr; +typedef struct CUmod_st *CUmodule; +typedef struct CUctx_st *CUcontext; +typedef struct CUfunc_st *CUfunction; +typedef struct CUstream_st *CUstream; + +typedef enum cudaError_enum { + CUDA_SUCCESS = 0, + CUDA_ERROR_INVALID_VALUE = 1, +} CUresult; + +typedef enum CUstream_flags_enum { + CU_STREAM_DEFAULT = 0x0, + CU_STREAM_NON_BLOCKING = 0x1, +} CUstream_flags; + +typedef enum CUdevice_attribute_enum { + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, +} CUdevice_attribute; + +typedef enum CUfunction_attribute_enum { + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, +} CUfunction_attribute; + +typedef enum CUctx_flags_enum { + CU_CTX_SCHED_BLOCKING_SYNC = 0x04, + CU_CTX_SCHED_MASK = 0x07, +} CUctx_flags; + +#define cuMemFree cuMemFree_v2 +#define cuMemAlloc cuMemAlloc_v2 +#define cuMemcpyDtoH cuMemcpyDtoH_v2 +#define cuMemcpyHtoD cuMemcpyHtoD_v2 +#define cuStreamDestroy cuStreamDestroy_v2 +#define cuModuleGetGlobal cuModuleGetGlobal_v2 +#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2 +#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2 +#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2 +#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2 +#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2 + +#define DECLARE_CUDA_FUNCTION(NAME, ...) \
+  CUresult (*NAME)(__VA_ARGS__); + +DECLARE_CUDA_FUNCTION(cuCtxGetDevice, CUdevice *) +DECLARE_CUDA_FUNCTION(cuDeviceGet, CUdevice *, int) +DECLARE_CUDA_FUNCTION(cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice) +DECLARE_CUDA_FUNCTION(cuDeviceGetCount, int *) +DECLARE_CUDA_FUNCTION(cuFuncGetAttribute, int *, CUfunction_attribute, + CUfunction) +DECLARE_CUDA_FUNCTION(cuGetErrorString, CUresult, const char **) +DECLARE_CUDA_FUNCTION(cuInit, unsigned) +DECLARE_CUDA_FUNCTION(cuLaunchKernel, CUfunction, unsigned, unsigned, unsigned, + unsigned, unsigned, unsigned, unsigned, CUstream, void **, + void **) +DECLARE_CUDA_FUNCTION(cuMemAlloc, CUdeviceptr *, size_t) +DECLARE_CUDA_FUNCTION(cuMemcpyDtoDAsync, CUdeviceptr, CUdeviceptr, size_t, + CUstream) +DECLARE_CUDA_FUNCTION(cuMemcpyDtoH, void *, CUdeviceptr, size_t) +DECLARE_CUDA_FUNCTION(cuMemcpyDtoHAsync, void *, CUdeviceptr, size_t, CUstream) +DECLARE_CUDA_FUNCTION(cuMemcpyHtoD, CUdeviceptr, const void *, size_t) +DECLARE_CUDA_FUNCTION(cuMemcpyHtoDAsync, CUdeviceptr, const void *, size_t, + CUstream) +DECLARE_CUDA_FUNCTION(cuMemFree, CUdeviceptr) +DECLARE_CUDA_FUNCTION(cuModuleGetFunction, CUfunction *, CUmodule, const char *) +DECLARE_CUDA_FUNCTION(cuModuleGetGlobal, CUdeviceptr *, size_t *, CUmodule, + const char *) +DECLARE_CUDA_FUNCTION(cuModuleUnload, CUmodule) +DECLARE_CUDA_FUNCTION(cuStreamCreate, CUstream *, unsigned) +DECLARE_CUDA_FUNCTION(cuStreamDestroy, CUstream) +DECLARE_CUDA_FUNCTION(cuStreamSynchronize, CUstream) +DECLARE_CUDA_FUNCTION(cuCtxSetCurrent, CUcontext) +DECLARE_CUDA_FUNCTION(cuDevicePrimaryCtxRelease, CUdevice) +DECLARE_CUDA_FUNCTION(cuDevicePrimaryCtxGetState, CUdevice, unsigned *, int *) +DECLARE_CUDA_FUNCTION(cuDevicePrimaryCtxSetFlags, CUdevice, unsigned) +DECLARE_CUDA_FUNCTION(cuDevicePrimaryCtxRetain, CUcontext *, CUdevice) +DECLARE_CUDA_FUNCTION(cuModuleLoadDataEx, CUmodule *, const void *, unsigned, + void *, void **); +DECLARE_CUDA_FUNCTION(cuDeviceCanAccessPeer, int *, CUdevice, 
CUdevice) +DECLARE_CUDA_FUNCTION(cuCtxEnablePeerAccess, CUcontext, unsigned) +DECLARE_CUDA_FUNCTION(cuMemcpyPeerAsync, CUdeviceptr, CUcontext, CUdeviceptr, + CUcontext, size_t, CUstream); + +#undef DECLARE_CUDA_FUNCTION + +static std::once_flag initFlag; +static bool CUDAIsValid = false; + +void checkForCUDA() { + const char *CudaLib = "libcuda.so"; + void *dynlib_handle = dlopen(CudaLib, RTLD_NOW); + if (!dynlib_handle) { + DP("Unable to load library '%s': %s!\n", CudaLib, dlerror()); + CUDAIsValid = false; + return; + } + + // If all runtime functions were found we are good to go. + CUDAIsValid = true; + +#define INIT_CUDA_FUNCTION(NAME) \ + NAME = reinterpret_cast<decltype(NAME)>(dlsym(dynlib_handle, #NAME)); \ + if (NAME == nullptr) { \ + DP("Unable to find '%s' in '%s'!\n", #NAME, CudaLib); \ + CUDAIsValid = false; \ + return; \ + } + + INIT_CUDA_FUNCTION(cuCtxGetDevice) + INIT_CUDA_FUNCTION(cuDeviceGet) + INIT_CUDA_FUNCTION(cuDeviceGetAttribute) + INIT_CUDA_FUNCTION(cuDeviceGetCount) + INIT_CUDA_FUNCTION(cuFuncGetAttribute) + INIT_CUDA_FUNCTION(cuGetErrorString) + INIT_CUDA_FUNCTION(cuInit) + INIT_CUDA_FUNCTION(cuLaunchKernel) + INIT_CUDA_FUNCTION(cuMemAlloc) + INIT_CUDA_FUNCTION(cuMemcpyDtoDAsync) + INIT_CUDA_FUNCTION(cuMemcpyDtoH) + INIT_CUDA_FUNCTION(cuMemcpyDtoHAsync) + INIT_CUDA_FUNCTION(cuMemcpyHtoD) + INIT_CUDA_FUNCTION(cuMemcpyHtoDAsync) + INIT_CUDA_FUNCTION(cuMemFree) + INIT_CUDA_FUNCTION(cuModuleGetFunction) + INIT_CUDA_FUNCTION(cuModuleGetGlobal) + INIT_CUDA_FUNCTION(cuModuleUnload) + INIT_CUDA_FUNCTION(cuStreamCreate) + INIT_CUDA_FUNCTION(cuStreamDestroy) + INIT_CUDA_FUNCTION(cuStreamSynchronize) + INIT_CUDA_FUNCTION(cuCtxSetCurrent) + INIT_CUDA_FUNCTION(cuDevicePrimaryCtxRelease) + INIT_CUDA_FUNCTION(cuDevicePrimaryCtxGetState) + INIT_CUDA_FUNCTION(cuDevicePrimaryCtxSetFlags) + INIT_CUDA_FUNCTION(cuDevicePrimaryCtxRetain) + INIT_CUDA_FUNCTION(cuModuleLoadDataEx) + INIT_CUDA_FUNCTION(cuDeviceCanAccessPeer) + INIT_CUDA_FUNCTION(cuCtxEnablePeerAccess) + 
INIT_CUDA_FUNCTION(cuMemcpyPeerAsync) + +#undef INIT_CUDA_FUNCTION +} + +} // namespace + +///} + // Utility for retrieving and printing CUDA error string. #ifdef OMPTARGET_DEBUG #define CUDA_ERR_STRING(err) \ @@ -292,8 +448,8 @@ std::vector DeviceData; std::vector Modules; - /// A class responsible for interacting with device native runtime library to - /// allocate and free memory. + /// A class responsible for interacting with device native runtime library + /// to allocate and free memory. class CUDADeviceAllocatorTy : public DeviceAllocatorTy { const int DeviceId; const std::vector &DeviceData; @@ -334,7 +490,8 @@ /// A vector of device allocators std::vector DeviceAllocators; - /// A vector of memory managers. Since the memory manager is non-copyable and + /// A vector of memory managers. Since the memory manager is non-copyable + /// and // non-removable, we wrap them into std::unique_ptr. std::vector> MemoryManagers; @@ -537,7 +694,8 @@ DP("Using %d CUDA blocks per grid\n", MaxGridDimX); DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; } else { - DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " + DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, " + "capping " "at the hard limit\n", MaxGridDimX, DeviceRTLTy::HardTeamLimit); DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; @@ -646,8 +804,8 @@ for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { if (!E->addr) { // We return nullptr when something like this happens, the host should - // have always something in the address to uniquely identify the target - // region. + // have always something in the address to uniquely identify the + // target region. DP("Invalid binary: host entry '' (size = %zd)...\n", E->size); return nullptr; } @@ -974,9 +1132,9 @@ // integer CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; } else { - // If we reach this point, then we have a non-combined construct, i.e. 
- // `teams distribute` with a nested `parallel for` and each team is - // assigned one iteration of the `distribute` loop. E.g.: + // If we reach this point, then we have a non-combined construct, + // i.e. `teams distribute` with a nested `parallel for` and each + // team is assigned one iteration of the `distribute` loop. E.g.: // // #pragma omp target teams distribute // for(...loop_tripcount...) { @@ -984,8 +1142,8 @@ // for(...) {} // } // - // Threads within a team will execute the iterations of the `parallel` - // loop. + // Threads within a team will execute the iterations of the + // `parallel` loop. CudaBlocksPerGrid = LoopTripCount; } DP("Using %d teams due to loop trip count %" PRIu32 @@ -1056,8 +1214,13 @@ #ifdef __cplusplus extern "C" { #endif - int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { + // First try to load cuda runtime calls, if that fails we cannot deal with + // (any) image. + std::call_once(initFlag, checkForCUDA); + if (!CUDAIsValid) + return false; + return elf_check_machine(image, /* EM_CUDA */ 190); }