diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt --- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt +++ b/openmp/libomptarget/plugins/cuda/CMakeLists.txt @@ -15,12 +15,6 @@ elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND) libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.") return() -elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND) - libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.") - return() -elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND) - libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.") - return() endif() libomptarget_say("Building CUDA offloading plugin.") @@ -28,7 +22,6 @@ # Define the suffix for the runtime messaging dumps. add_definitions(-DTARGET_NAME=CUDA) -include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}) add_library(omptarget.rtl.cuda SHARED src/rtl.cpp) @@ -39,7 +32,7 @@ target_link_libraries(omptarget.rtl.cuda elf_common MemoryManager - ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES} + ${CMAKE_DL_LIBS} ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" "-Wl,-z,defs") diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -12,7 +12,8 @@ #include #include -#include +#include +#include #include #include #include @@ -27,6 +28,161 @@ #include "MemoryManager.h" +/// To avoid a cuda.h and link-time libcuda.so dependence we declare used values +/// ourselves and load the runtime functions we need at runtime. 
+/// +///{ + +namespace { + +typedef int CUdevice; +typedef uintptr_t CUdeviceptr; +typedef struct CUmod_st *CUmodule; +typedef struct CUctx_st *CUcontext; +typedef struct CUfunc_st *CUfunction; +typedef struct CUstream_st *CUstream; + +typedef enum cudaError_enum { + CUDA_SUCCESS = 0, + CUDA_ERROR_INVALID_VALUE = 1, +} CUresult; + +typedef enum CUstream_flags_enum { + CU_STREAM_DEFAULT = 0x0, + CU_STREAM_NON_BLOCKING = 0x1, +} CUstream_flags; + +typedef enum CUdevice_attribute_enum { + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, +} CUdevice_attribute; + +typedef enum CUfunction_attribute_enum { + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, +} CUfunction_attribute; + +typedef enum CUctx_flags_enum { + CU_CTX_SCHED_BLOCKING_SYNC = 0x04, + CU_CTX_SCHED_MASK = 0x07, +} CUctx_flags; + +#define cuMemFree cuMemFree_v2 +#define cuMemAlloc cuMemAlloc_v2 +#define cuMemcpyDtoH cuMemcpyDtoH_v2 +#define cuMemcpyHtoD cuMemcpyHtoD_v2 +#define cuStreamDestroy cuStreamDestroy_v2 +#define cuModuleGetGlobal cuModuleGetGlobal_v2 +#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2 +#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2 +#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2 +#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2 +#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2 + +#define DECLARE_CUDA_FUNCTION(NAME, ...) \
+  CUresult (*NAME)(__VA_ARGS__); + +DECLARE_CUDA_FUNCTION(cuCtxGetDevice, CUdevice *) +DECLARE_CUDA_FUNCTION(cuDeviceGet, CUdevice *, int) +DECLARE_CUDA_FUNCTION(cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice) +DECLARE_CUDA_FUNCTION(cuDeviceGetCount, int *) +DECLARE_CUDA_FUNCTION(cuFuncGetAttribute, int *, CUfunction_attribute, + CUfunction) +DECLARE_CUDA_FUNCTION(cuGetErrorString, CUresult, const char **) +DECLARE_CUDA_FUNCTION(cuInit, unsigned) +DECLARE_CUDA_FUNCTION(cuLaunchKernel, CUfunction, unsigned, unsigned, unsigned, + unsigned, unsigned, unsigned, unsigned, CUstream, void **, + void **) +DECLARE_CUDA_FUNCTION(cuMemAlloc, CUdeviceptr *, size_t) +DECLARE_CUDA_FUNCTION(cuMemcpyDtoDAsync, CUdeviceptr, CUdeviceptr, size_t, + CUstream) +DECLARE_CUDA_FUNCTION(cuMemcpyDtoH, void *, CUdeviceptr, size_t) +DECLARE_CUDA_FUNCTION(cuMemcpyDtoHAsync, void *, CUdeviceptr, size_t, CUstream) +DECLARE_CUDA_FUNCTION(cuMemcpyHtoD, CUdeviceptr, const void *, size_t) +DECLARE_CUDA_FUNCTION(cuMemcpyHtoDAsync, CUdeviceptr, const void *, size_t, + CUstream) +DECLARE_CUDA_FUNCTION(cuMemFree, CUdeviceptr) +DECLARE_CUDA_FUNCTION(cuModuleGetFunction, CUfunction *, CUmodule, const char *) +DECLARE_CUDA_FUNCTION(cuModuleGetGlobal, CUdeviceptr *, size_t *, CUmodule, + const char *) +DECLARE_CUDA_FUNCTION(cuModuleUnload, CUmodule) +DECLARE_CUDA_FUNCTION(cuStreamCreate, CUstream *, unsigned) +DECLARE_CUDA_FUNCTION(cuStreamDestroy, CUstream) +DECLARE_CUDA_FUNCTION(cuStreamSynchronize, CUstream) +DECLARE_CUDA_FUNCTION(cuCtxSetCurrent, CUcontext) +DECLARE_CUDA_FUNCTION(cuDevicePrimaryCtxRelease, CUdevice) +DECLARE_CUDA_FUNCTION(cuDevicePrimaryCtxGetState, CUdevice, unsigned *, int *) +DECLARE_CUDA_FUNCTION(cuDevicePrimaryCtxSetFlags, CUdevice, unsigned) +DECLARE_CUDA_FUNCTION(cuDevicePrimaryCtxRetain, CUcontext *, CUdevice) +DECLARE_CUDA_FUNCTION(cuModuleLoadDataEx, CUmodule *, const void *, unsigned, + void *, void **); +DECLARE_CUDA_FUNCTION(cuDeviceCanAccessPeer, int *, CUdevice, 
CUdevice) +DECLARE_CUDA_FUNCTION(cuCtxEnablePeerAccess, CUcontext, unsigned) +DECLARE_CUDA_FUNCTION(cuMemcpyPeerAsync, CUdeviceptr, CUcontext, CUdeviceptr, + CUcontext, size_t, CUstream); + +#undef DECLARE_CUDA_FUNCTION + +static std::once_flag initFlag; +static bool CUDAIsValid = false; + +void checkForCUDA() { + const char *CudaLib = "libcuda.so"; + void *dynlib_handle = dlopen(CudaLib, RTLD_NOW); + if (!dynlib_handle) { + DP("Unable to load library '%s': %s!\n", CudaLib, dlerror()); + CUDAIsValid = false; + return; + } + + // If all runtime functions were found we are good to go. + CUDAIsValid = true; + +#define INIT_CUDA_FUNCTION(NAME) \ + NAME = reinterpret_cast<decltype(NAME)>(dlsym(dynlib_handle, #NAME)); \ + if (NAME == nullptr) { \ + DP("Unable to find '%s' in '%s'!\n", #NAME, CudaLib); \ + CUDAIsValid = false; \ + return; \ + } + + INIT_CUDA_FUNCTION(cuCtxGetDevice) + INIT_CUDA_FUNCTION(cuDeviceGet) + INIT_CUDA_FUNCTION(cuDeviceGetAttribute) + INIT_CUDA_FUNCTION(cuDeviceGetCount) + INIT_CUDA_FUNCTION(cuFuncGetAttribute) + INIT_CUDA_FUNCTION(cuGetErrorString) + INIT_CUDA_FUNCTION(cuInit) + INIT_CUDA_FUNCTION(cuLaunchKernel) + INIT_CUDA_FUNCTION(cuMemAlloc) + INIT_CUDA_FUNCTION(cuMemcpyDtoDAsync) + INIT_CUDA_FUNCTION(cuMemcpyDtoH) + INIT_CUDA_FUNCTION(cuMemcpyDtoHAsync) + INIT_CUDA_FUNCTION(cuMemcpyHtoD) + INIT_CUDA_FUNCTION(cuMemcpyHtoDAsync) + INIT_CUDA_FUNCTION(cuMemFree) + INIT_CUDA_FUNCTION(cuModuleGetFunction) + INIT_CUDA_FUNCTION(cuModuleGetGlobal) + INIT_CUDA_FUNCTION(cuModuleUnload) + INIT_CUDA_FUNCTION(cuStreamCreate) + INIT_CUDA_FUNCTION(cuStreamDestroy) + INIT_CUDA_FUNCTION(cuStreamSynchronize) + INIT_CUDA_FUNCTION(cuCtxSetCurrent) + INIT_CUDA_FUNCTION(cuDevicePrimaryCtxRelease) + INIT_CUDA_FUNCTION(cuDevicePrimaryCtxGetState) + INIT_CUDA_FUNCTION(cuDevicePrimaryCtxSetFlags) + INIT_CUDA_FUNCTION(cuDevicePrimaryCtxRetain) + INIT_CUDA_FUNCTION(cuModuleLoadDataEx) + INIT_CUDA_FUNCTION(cuDeviceCanAccessPeer) + INIT_CUDA_FUNCTION(cuCtxEnablePeerAccess) + 
INIT_CUDA_FUNCTION(cuMemcpyPeerAsync) + +#undef INIT_CUDA_FUNCTION +} + +} // namespace + +///} + // Utility for retrieving and printing CUDA error string. #ifdef OMPTARGET_DEBUG #define CUDA_ERR_STRING(err) \ @@ -292,8 +448,8 @@ std::vector DeviceData; std::vector Modules; - /// A class responsible for interacting with device native runtime library to - /// allocate and free memory. + /// A class responsible for interacting with device native runtime library + /// to allocate and free memory. class CUDADeviceAllocatorTy : public DeviceAllocatorTy { const int DeviceId; const std::vector &DeviceData; @@ -334,7 +490,8 @@ /// A vector of device allocators std::vector DeviceAllocators; - /// A vector of memory managers. Since the memory manager is non-copyable and + /// A vector of memory managers. Since the memory manager is non-copyable + /// and // non-removable, we wrap them into std::unique_ptr. std::vector> MemoryManagers; @@ -537,7 +694,8 @@ DP("Using %d CUDA blocks per grid\n", MaxGridDimX); DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; } else { - DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " + DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, " + "capping " "at the hard limit\n", MaxGridDimX, DeviceRTLTy::HardTeamLimit); DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; @@ -646,8 +804,8 @@ for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { if (!E->addr) { // We return nullptr when something like this happens, the host should - // have always something in the address to uniquely identify the target - // region. + // have always something in the address to uniquely identify the + // target region. DP("Invalid binary: host entry '' (size = %zd)...\n", E->size); return nullptr; } @@ -974,9 +1132,9 @@ // integer CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; } else { - // If we reach this point, then we have a non-combined construct, i.e. 
- // `teams distribute` with a nested `parallel for` and each team is - // assigned one iteration of the `distribute` loop. E.g.: + // If we reach this point, then we have a non-combined construct, + // i.e. `teams distribute` with a nested `parallel for` and each + // team is assigned one iteration of the `distribute` loop. E.g.: // // #pragma omp target teams distribute // for(...loop_tripcount...) { @@ -984,8 +1142,8 @@ // for(...) {} // } // - // Threads within a team will execute the iterations of the `parallel` - // loop. + // Threads within a team will execute the iterations of the + // `parallel` loop. CudaBlocksPerGrid = LoopTripCount; } DP("Using %d teams due to loop trip count %" PRIu32 @@ -1056,8 +1214,13 @@ #ifdef __cplusplus extern "C" { #endif - int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { + // First try to load cuda runtime calls, if that fails we cannot deal with + // (any) image. + std::call_once(initFlag, checkForCUDA); + if (!CUDAIsValid) + return false; + return elf_check_machine(image, /* EM_CUDA */ 190); }