Changeset View
Changeset View
Standalone View
Standalone View
tools/GPURuntime/GPUJIT.c
Show First 20 Lines • Show All 935 Lines • ▼ Show 20 Lines | |||||
/* Dynamic library handles for the CUDA and CUDA runtime library. */ | /* Dynamic library handles for the CUDA and CUDA runtime library. */ | ||||
static void *HandleCuda; | static void *HandleCuda; | ||||
static void *HandleCudaRT; | static void *HandleCudaRT; | ||||
/* Type-defines of function pointer to CUDA driver APIs. */ | /* Type-defines of function pointer to CUDA driver APIs. */ | ||||
typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t); | typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t); | ||||
static CuMemAllocFcnTy *CuMemAllocFcnPtr; | static CuMemAllocFcnTy *CuMemAllocFcnPtr; | ||||
typedef CUresult CUDAAPI CuMemAllocManagedFcnTy(CUdeviceptr *, size_t, | |||||
unsigned int); | |||||
static CuMemAllocManagedFcnTy *CuMemAllocManagedFcnPtr; | |||||
typedef CUresult CUDAAPI CuLaunchKernelFcnTy( | typedef CUresult CUDAAPI CuLaunchKernelFcnTy( | ||||
CUfunction F, unsigned int GridDimX, unsigned int GridDimY, | CUfunction F, unsigned int GridDimX, unsigned int GridDimY, | ||||
unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY, | unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY, | ||||
unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream, | unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream, | ||||
void **KernelParams, void **Extra); | void **KernelParams, void **Extra); | ||||
static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr; | static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr; | ||||
typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t); | typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t); | ||||
▲ Show 20 Lines • Show All 124 Lines • ▼ Show 20 Lines | if (initialDeviceAPILibrariesCUDA() == 0) | ||||
return 0; | return 0; | ||||
CuLaunchKernelFcnPtr = | CuLaunchKernelFcnPtr = | ||||
(CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel"); | (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel"); | ||||
CuMemAllocFcnPtr = | CuMemAllocFcnPtr = | ||||
(CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2"); | (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2"); | ||||
CuMemAllocManagedFcnPtr = (CuMemAllocManagedFcnTy *)getAPIHandleCUDA( | |||||
HandleCuda, "cuMemAllocManaged"); | |||||
CuMemFreeFcnPtr = | CuMemFreeFcnPtr = | ||||
(CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2"); | (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2"); | ||||
CuMemcpyDtoHFcnPtr = | CuMemcpyDtoHFcnPtr = | ||||
(CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2"); | (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2"); | ||||
CuMemcpyHtoDFcnPtr = | CuMemcpyHtoDFcnPtr = | ||||
(CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2"); | (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2"); | ||||
▲ Show 20 Lines • Show All 348 Lines • ▼ Show 20 Lines | void polly_freeManaged(void *mem) { | ||||
// calls in the original source than `malloc` calls. Hence, replacing all | // calls in the original source than `malloc` calls. Hence, replacing all | ||||
// `free`s with `cudaFree` does not work, since we would try to free | // `free`s with `cudaFree` does not work, since we would try to free | ||||
// 'illegal' memory. | // 'illegal' memory. | ||||
// As a quick fix, we keep a free list and check if `mem` is a managed memory | // As a quick fix, we keep a free list and check if `mem` is a managed memory | ||||
// pointer. If it is, we call `cudaFree`. | // pointer. If it is, we call `cudaFree`. | ||||
// If not, we pass it along to the underlying allocator. | // If not, we pass it along to the underlying allocator. | ||||
// This is a hack, and can be removed if the underlying issue is fixed. | // This is a hack, and can be removed if the underlying issue is fixed. | ||||
if (isManagedPtr(mem)) { | if (isManagedPtr(mem)) { | ||||
if (cudaFree(mem) != cudaSuccess) { | if (CuMemFreeFcnPtr((size_t)mem) != CUDA_SUCCESS) { | ||||
fprintf(stderr, "cudaFree failed.\n"); | fprintf(stderr, "cudaFree failed.\n"); | ||||
exit(-1); | exit(-1); | ||||
} | } | ||||
return; | return; | ||||
} else { | } else { | ||||
free(mem); | free(mem); | ||||
} | } | ||||
} | } | ||||
void *polly_mallocManaged(size_t size) { | void *polly_mallocManaged(size_t size) { | ||||
// Note: [Size 0 allocations] | // Note: [Size 0 allocations] | ||||
// Sometimes, some runtime computation of size could create a size of 0 | // Sometimes, some runtime computation of size could create a size of 0 | ||||
// for an allocation. In these cases, we do not wish to fail. | // for an allocation. In these cases, we do not wish to fail. | ||||
// The CUDA API fails on size 0 allocations. | // The CUDA API fails on size 0 allocations. | ||||
// So, we allocate size a minimum of size 1. | // So, we allocate size a minimum of size 1. | ||||
if (!size && DebugMode) | if (!size && DebugMode) | ||||
fprintf(stderr, "cudaMallocManaged called with size 0. " | fprintf(stderr, "cudaMallocManaged called with size 0. " | ||||
"Promoting to size 1"); | "Promoting to size 1"); | ||||
size = max(size, 1); | size = max(size, 1); | ||||
polly_initContextCUDA(); | PollyGPUContext *_ = polly_initContextCUDA(); | ||||
dump_function(); | assert(_ && "polly_initContextCUDA failed"); | ||||
void *a; | |||||
if (cudaMallocManaged(&a, size, cudaMemAttachGlobal) != cudaSuccess) { | void *newMemPtr; | ||||
const CUresult Res = CuMemAllocManagedFcnPtr((CUdeviceptr *)&newMemPtr, size, | |||||
CU_MEM_ATTACH_GLOBAL); | |||||
if (Res != CUDA_SUCCESS) { | |||||
fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size); | fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size); | ||||
exit(-1); | exit(-1); | ||||
} | } | ||||
addManagedPtr(a); | addManagedPtr(newMemPtr); | ||||
return a; | return newMemPtr; | ||||
} | } | ||||
static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) { | static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) { | ||||
dump_function(); | dump_function(); | ||||
CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr; | CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr; | ||||
CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda); | CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda); | ||||
free(DevPtr); | free(DevPtr); | ||||
free(Allocation); | free(Allocation); | ||||
▲ Show 20 Lines • Show All 328 Lines • Show Last 20 Lines |