diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -12,7 +12,7 @@
 
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -27,6 +27,119 @@
 
 #include "MemoryManager.h"
 
+/// Forward declare cuda functions to avoid cuda.h dependence.
+///
+/// This is a hand-written subset of the CUDA driver API. The driver library
+/// exports its entry points as unmangled C symbols, so the prototypes are
+/// given C language linkage; with C++ linkage this file would reference
+/// mangled names that the driver library does not export.
+///
+/// NOTE(review): the _v2 spellings of cuDevicePrimaryCtxRelease and
+/// cuDevicePrimaryCtxSetFlags are presumably only exported by CUDA 11 and
+/// newer drivers - confirm the oldest driver this plugin must support.
+///{
+
+extern "C" {
+
+typedef int CUdevice;
+typedef uintptr_t CUdeviceptr;
+typedef struct CUmod_st *CUmodule;
+typedef struct CUctx_st *CUcontext;
+typedef struct CUfunc_st *CUfunction;
+typedef struct CUstream_st *CUstream;
+
+// Only two result codes are named; any other value the driver returns has no
+// enumerator here.
+typedef enum cudaError_enum {
+  CUDA_SUCCESS = 0,
+  CUDA_ERROR_INVALID_VALUE = 1,
+} CUresult;
+
+typedef enum CUstream_flags_enum {
+  CU_STREAM_DEFAULT = 0x0,
+  CU_STREAM_NON_BLOCKING = 0x1,
+} CUstream_flags;
+
+typedef enum CUdevice_attribute_enum {
+  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
+  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
+  CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
+} CUdevice_attribute;
+
+typedef enum CUfunction_attribute_enum {
+  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+} CUfunction_attribute;
+
+typedef enum CUctx_flags_enum {
+  CU_CTX_SCHED_BLOCKING_SYNC = 0x04,
+  CU_CTX_SCHED_MASK = 0x07,
+} CUctx_flags;
+
+// Each `#define foo foo_v2` precedes its prototype so that the declaration and
+// every later use of `foo` in this file name the versioned `foo_v2` symbol.
+CUresult cuCtxGetDevice(CUdevice *Device);
+CUresult cuDeviceGet(CUdevice *Device, int Ordinal);
+CUresult cuDeviceGetAttribute(int *Value, CUdevice_attribute Attribute,
+                              CUdevice Device);
+CUresult cuDeviceGetCount(int *Count);
+CUresult cuFuncGetAttribute(int *Value, CUfunction_attribute Attribute,
+                            CUfunction Function);
+CUresult cuGetErrorString(CUresult Error, const char **String);
+CUresult cuInit(unsigned Flags);
+CUresult cuLaunchKernel(CUfunction Function, unsigned GridDimX,
+                        unsigned GridDimY, unsigned GridDimZ,
+                        unsigned BlockDimX, unsigned BlockDimY,
+                        unsigned BlockDimZ, unsigned SharedMemBytes,
+                        CUstream Stream, void **KernelParams, void **Extra);
+#define cuMemAlloc cuMemAlloc_v2
+CUresult cuMemAlloc(CUdeviceptr *DevicePtr, size_t Size);
+#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
+CUresult cuMemcpyDtoDAsync(CUdeviceptr Dst, CUdeviceptr Src, size_t Size,
+                           CUstream Stream);
+#define cuMemcpyDtoH cuMemcpyDtoH_v2
+CUresult cuMemcpyDtoH(void *DstHost, CUdeviceptr SrcDevice, size_t Size);
+#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2
+CUresult cuMemcpyDtoHAsync(void *DstHost, CUdeviceptr SrcDevice, size_t Size,
+                           CUstream Stream);
+#define cuMemcpyHtoD cuMemcpyHtoD_v2
+CUresult cuMemcpyHtoD(CUdeviceptr DstDevice, const void *SrcHost, size_t Size);
+#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
+CUresult cuMemcpyHtoDAsync(CUdeviceptr DstDevice, const void *SrcHost,
+                           size_t Size, CUstream Stream);
+#define cuMemFree cuMemFree_v2
+CUresult cuMemFree(CUdeviceptr DevicePtr);
+CUresult cuModuleGetFunction(CUfunction *Function, CUmodule Module,
+                             const char *Name);
+#define cuModuleGetGlobal cuModuleGetGlobal_v2
+CUresult cuModuleGetGlobal(CUdeviceptr *DevicePtr, size_t *Size,
+                           CUmodule Module, const char *Name);
+CUresult cuModuleUnload(CUmodule Module);
+CUresult cuStreamCreate(CUstream *Stream, unsigned Flags);
+#define cuStreamDestroy cuStreamDestroy_v2
+CUresult cuStreamDestroy(CUstream Stream);
+CUresult cuStreamSynchronize(CUstream Stream);
+CUresult cuCtxSetCurrent(CUcontext Context);
+#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2
+CUresult cuDevicePrimaryCtxRelease(CUdevice Device);
+CUresult cuDevicePrimaryCtxGetState(CUdevice Device, unsigned *Flags,
+                                    int *Active);
+#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2
+CUresult cuDevicePrimaryCtxSetFlags(CUdevice Device, unsigned Flags);
+CUresult cuDevicePrimaryCtxRetain(CUcontext *Context, CUdevice Device);
+CUresult cuModuleLoadDataEx(CUmodule *Module, const void *Image,
+                            unsigned NumOptions, void *Options,
+                            void **OptionValues);
+CUresult cuDeviceCanAccessPeer(int *CanAccessPeer, CUdevice Device,
+                               CUdevice PeerDevice);
+CUresult cuCtxEnablePeerAccess(CUcontext PeerContext, unsigned Flags);
+CUresult cuMemcpyPeerAsync(CUdeviceptr DstDevice, CUcontext DstContext,
+                           CUdeviceptr SrcDevice, CUcontext SrcContext,
+                           size_t Size, CUstream Stream);
+
+} // extern "C"
+
+///}
+
 // Utility for retrieving and printing CUDA error string.
 #ifdef OMPTARGET_DEBUG
 #define CUDA_ERR_STRING(err) \