Index: tools/GPURuntime/GPUJIT.c =================================================================== --- tools/GPURuntime/GPUJIT.c +++ tools/GPURuntime/GPUJIT.c @@ -941,6 +941,10 @@ typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t); static CuMemAllocFcnTy *CuMemAllocFcnPtr; +typedef CUresult CUDAAPI CuMemAllocManagedFcnTy(CUdeviceptr *, size_t, + unsigned int); +static CuMemAllocManagedFcnTy *CuMemAllocManagedFcnPtr; + typedef CUresult CUDAAPI CuLaunchKernelFcnTy( CUfunction F, unsigned int GridDimX, unsigned int GridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY, @@ -1081,6 +1085,9 @@ CuMemAllocFcnPtr = (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2"); + CuMemAllocManagedFcnPtr = (CuMemAllocManagedFcnTy *)getAPIHandleCUDA( + HandleCuda, "cuMemAllocManaged"); + CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2"); @@ -1445,7 +1452,7 @@ // If not, we pass it along to the underlying allocator. // This is a hack, and can be removed if the underlying issue is fixed. if (isManagedPtr(mem)) { - if (cudaFree(mem) != cudaSuccess) { + if (CuMemFreeFcnPtr((size_t)mem) != CUDA_SUCCESS) { fprintf(stderr, "cudaFree failed.\n"); exit(-1); } @@ -1465,15 +1472,18 @@ fprintf(stderr, "cudaMallocManaged called with size 0. " "Promoting to size 1"); size = max(size, 1); - polly_initContextCUDA(); - dump_function(); - void *a; - if (cudaMallocManaged(&a, size, cudaMemAttachGlobal) != cudaSuccess) { + PollyGPUContext *_ = polly_initContextCUDA(); + assert(_ && "polly_initContextCUDA failed"); + + void *newMemPtr; + const CUresult Res = CuMemAllocManagedFcnPtr((CUdeviceptr *)&newMemPtr, size, + CU_MEM_ATTACH_GLOBAL); + if (Res != CUDA_SUCCESS) { fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size); exit(-1); } - addManagedPtr(a); - return a; + addManagedPtr(newMemPtr); + return newMemPtr; } static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) { @@ -1531,6 +1541,7 @@ } static void freeContextCUDA(PollyGPUContext *Context) { + dump_function(); CUDAContext *Ctx = (CUDAContext *)Context->Context;