Index: tools/GPURuntime/GPUJIT.h =================================================================== --- tools/GPURuntime/GPUJIT.h +++ tools/GPURuntime/GPUJIT.h @@ -13,6 +13,7 @@ #ifndef GPUJIT_H_ #define GPUJIT_H_ +#include "stddef.h" /* * The following demostrates how we can use the GPURuntime library to @@ -110,4 +111,13 @@ void **Parameters); void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation); void polly_freeContext(PollyGPUContext *Context); + +// Note that polly_{malloc/free}Managed are currently not used by Polly. +// We use them in COSMO by replacing all malloc with polly_mallocManaged and all +// frees with cudaFree, so we can get managed memory "automatically". +// Needless to say, this is a hack. +// Please make sure that this code is not present in Polly when 2018 rolls in. +// If this is still present, ping Siddharth Bhat +void *polly_mallocManaged(size_t size); +void polly_freeManaged(void *mem); #endif /* GPUJIT_H_ */ Index: tools/GPURuntime/GPUJIT.c =================================================================== --- tools/GPURuntime/GPUJIT.c +++ tools/GPURuntime/GPUJIT.c @@ -973,6 +973,9 @@ typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice); static CuCtxCreateFcnTy *CuCtxCreateFcnPtr; +typedef CUresult CUDAAPI CuCtxGetCurrentFcnTy(CUcontext *); +static CuCtxGetCurrentFcnTy *CuCtxGetCurrentFcnPtr; + typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int); static CuDeviceGetFcnTy *CuDeviceGetFcnPtr; @@ -1105,6 +1108,9 @@ CuCtxCreateFcnPtr = (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2"); + CuCtxGetCurrentFcnPtr = + (CuCtxGetCurrentFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxGetCurrent"); + CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA( HandleCuda, "cuModuleLoadDataEx"); @@ -1194,7 +1200,33 @@ fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n"); exit(-1); } - CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0, Device); + + // In cases where managed memory is used, it is quite likely that + // `cudaMallocManaged` / `polly_mallocManaged` was called before + // `polly_initContext` was called. + // + // If `polly_initContext` calls `CuCtxCreate` when there already was a + // pre-existing context created by the runtime API, this causes code running + // on P100 to hang. So, we query for a pre-existing context to try and use. + // If there is no pre-existing context, we create a new context + + // The possible pre-existing context from previous runtime API calls. + CUcontext MaybeRuntimeAPIContext; + if (CuCtxGetCurrentFcnPtr(&MaybeRuntimeAPIContext) != CUDA_SUCCESS) { + fprintf(stderr, "cuCtxGetCurrent failed.\n"); + exit(-1); + } + + // There was no previous context, initialise it. + if (MaybeRuntimeAPIContext == NULL) { + if (CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0, + Device) != CUDA_SUCCESS) { + fprintf(stderr, "cuCtxCreateFcnPtr failed.\n"); + exit(-1); + } + } else { + ((CUDAContext *)Context->Context)->Cuda = MaybeRuntimeAPIContext; + } if (CacheMode) CurrentContext = Context; @@ -1377,6 +1409,59 @@ } } +// Maximum number of managed memory pointers. +#define MAX_POINTERS 4000 +// For the rationale behing a list of free pointers, see `polly_freeManaged`. +void *g_managedptrs[MAX_POINTERS]; +int g_nmanagedptrs = 0; + +// Add a pointer as being allocated by cuMallocManaged +void addManagedPtr(void *mem) { + assert(g_nmanagedptrs < MAX_POINTERS && "We have hit the maximum number of " + "managed pointers allowed. Increase " + "MAX_POINTERS"); + g_managedptrs[g_nmanagedptrs++] = mem; +} + +int isManagedPtr(void *mem) { + for (int i = 0; i < g_nmanagedptrs; i++) { + if (g_managedptrs[i] == mem) + return 1; + } + return 0; +} + +void polly_freeManaged(void *mem) { + dump_function(); + + // Something extremely weird goes on in the COSMO code: we get calls + // to free memory that have not been `malloc`d. Since we replace all free + // with `cudaFreee`, sometimes we get calls to free to memory we have not + // even mallocd. + // So, keep a free list and check if this is one of "our" pointers. If it is, + // call `cudaFree`. If not, pass it along to the underlying allocator. + if (isManagedPtr(mem)) { + if (cudaFree(mem) != cudaSuccess) { + fprintf(stderr, "cudaFree failed.\n"); + exit(-1); + } + return; + } else { + free(mem); + } +} + +void *polly_mallocManaged(size_t size) { + dump_function(); + void *a; + if (cudaMallocManaged(&a, size, cudaMemAttachGlobal) != cudaSuccess) { + fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size); + exit(-1); + } + addManagedPtr(a); + return a; +} + static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) { dump_function(); CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;