Index: CMakeLists.txt =================================================================== --- CMakeLists.txt +++ CMakeLists.txt @@ -152,9 +152,10 @@ option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF) if (POLLY_ENABLE_GPGPU_CODEGEN) - # Do not require CUDA, as GPU code generation test cases can be run without - # a cuda library. + # Do not require CUDA/OpenCL, as GPU code generation test cases can be run + # without a CUDA/OpenCL library. FIND_PACKAGE(CUDA) + FIND_PACKAGE(OpenCL) set(GPU_CODEGEN TRUE) else(POLLY_ENABLE_GPGPU_CODEGEN) set(GPU_CODEGEN FALSE) @@ -162,9 +163,10 @@ # Support GPGPU code generation if the library is available. -if (CUDALIB_FOUND) +if (CUDALIB_FOUND AND OpenCL_FOUND) INCLUDE_DIRECTORIES( ${CUDALIB_INCLUDE_DIR} ) -endif(CUDALIB_FOUND) + INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} ) +endif(CUDALIB_FOUND AND OpenCL_FOUND) option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON) if (NOT POLLY_BUNDLED_ISL) Index: include/polly/LinkAllPasses.h =================================================================== --- include/polly/LinkAllPasses.h +++ include/polly/LinkAllPasses.h @@ -48,7 +48,7 @@ llvm::Pass *createIslAstInfoPass(); llvm::Pass *createCodeGenerationPass(); #ifdef GPU_CODEGEN -llvm::Pass *createPPCGCodeGenerationPass(); +llvm::Pass *createPPCGCodeGenerationPass(int Runtime); #endif llvm::Pass *createIslScheduleOptimizerPass(); llvm::Pass *createFlattenSchedulePass(); @@ -83,7 +83,7 @@ polly::createIslAstInfoPass(); polly::createCodeGenerationPass(); #ifdef GPU_CODEGEN - polly::createPPCGCodeGenerationPass(); + polly::createPPCGCodeGenerationPass(0); #endif polly::createIslScheduleOptimizerPass(); polly::createFlattenSchedulePass(); Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -53,6 +53,13 @@ #define DEBUG_TYPE "polly-codegen-ppcg" +/// The GPU Runtime implementation to use. +enum GPURuntimeT { + GPU_RUNTIME_NONE, + GPU_RUNTIME_CUDA, + GPU_RUNTIME_OPENCL +}; + static cl::opt DumpSchedule("polly-acc-dump-schedule", cl::desc("Dump the computed GPU Schedule"), cl::Hidden, cl::init(false), cl::ZeroOrMore, @@ -146,9 +153,9 @@ GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, Scop &S, BasicBlock *StartBlock, - gpu_prog *Prog) + gpu_prog *Prog, enum GPURuntimeT GPURuntime) : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock), - Prog(Prog) { + Prog(Prog), Runtime(GPURuntime) { getExprBuilder().setIDToSAI(&IDToSAI); } @@ -194,6 +201,9 @@ /// The GPU program we generate code for. gpu_prog *Prog; + /// The GPU Runtime implementation to use (OpenCL or CUDA). + enum GPURuntimeT Runtime; + /// Class to free isl_ids. class IslIdDeleter { public: @@ -700,7 +710,14 @@ } Value *GPUNodeBuilder::createCallInitContext() { - const char *Name = "polly_initContext"; + assert(Runtime != GPU_RUNTIME_NONE && "GPU Runtime not set"); + const char *Name; + + if (Runtime == GPU_RUNTIME_CUDA) + Name = "polly_initContextCUDA"; + else if (Runtime == GPU_RUNTIME_OPENCL) + Name = "polly_initContextCL"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Function *F = M->getFunction(Name); @@ -1543,10 +1560,16 @@ void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, SetVector &SubtreeValues) { + assert(Runtime != GPU_RUNTIME_NONE && "GPU Runtime not set"); std::string Identifier = "kernel_" + std::to_string(Kernel->id); GPUModule.reset(new Module(Identifier, Builder.getContext())); - GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); + + if (Runtime == GPU_RUNTIME_CUDA) + GPUModule->setTargetTriple(Triple::normalize("nvptx64-cuda")); + else if (Runtime == GPU_RUNTIME_OPENCL) + GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvcl")); + GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); @@ -1568,7 +1591,14 @@ } std::string GPUNodeBuilder::createKernelASM() { - llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda")); + assert(Runtime != GPU_RUNTIME_NONE && "GPU Runtime not set"); + llvm::Triple GPUTriple; + + if (Runtime == GPU_RUNTIME_CUDA) + GPUTriple = llvm::Triple(Triple::normalize("nvptx64-cuda")); + else if (Runtime == GPU_RUNTIME_OPENCL) + GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvcl")); + std::string ErrMsg; auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); @@ -1633,6 +1663,8 @@ public: static char ID; + enum GPURuntimeT Runtime = GPU_RUNTIME_NONE; + /// The scop that is currently processed. Scop *S; @@ -2416,7 +2448,7 @@ executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI); GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S, - StartBlock, Prog); + StartBlock, Prog, Runtime); // TODO: Handle LICM auto SplitBlock = StartBlock->getSinglePredecessor(); @@ -2504,7 +2536,18 @@ char PPCGCodeGeneration::ID = 1; -Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); } +Pass *polly::createPPCGCodeGenerationPass(int Runtime) { + PPCGCodeGeneration *generator = new PPCGCodeGeneration(); + switch (Runtime) { + case 1: + generator->Runtime = GPU_RUNTIME_CUDA; + break; + case 2: + generator->Runtime = GPU_RUNTIME_OPENCL; + break; + } + return generator; +} INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg", "Polly - Apply PPCG translation to SCOP", false, false) Index: lib/Support/RegisterPasses.cpp =================================================================== --- lib/Support/RegisterPasses.cpp +++ lib/Support/RegisterPasses.cpp @@ -91,15 +91,25 @@ cl::Hidden, cl::init(CODEGEN_FULL), cl::ZeroOrMore, cl::cat(PollyCategory)); enum TargetChoice { TARGET_CPU, TARGET_GPU }; -static cl::opt - Target("polly-target", cl::desc("The hardware to target"), - cl::values(clEnumValN(TARGET_CPU, "cpu", "generate CPU code") +static cl::opt Target( + "polly-target", cl::desc("The hardware to target"), + cl::values(clEnumValN(TARGET_CPU, "cpu", "generate CPU code") #ifdef GPU_CODEGEN - , - clEnumValN(TARGET_GPU, "gpu", "generate GPU code") + , + clEnumValN(TARGET_GPU, "gpu", "generate GPU code") +#endif + ), + cl::init(TARGET_CPU), cl::ZeroOrMore, cl::cat(PollyCategory)); + +#ifdef GPU_CODEGEN +enum GPURuntimeChoice { GPU_RUNTIME_CUDA, GPU_RUNTIME_OPENCL }; +static cl::opt GPURuntime( + "polly-gpu-runtime", cl::desc("The GPU Runtime API to target"), + cl::values(clEnumValN(GPU_RUNTIME_CUDA, "cuda", "use the CUDA Runtime API"), + clEnumValN(GPU_RUNTIME_OPENCL, "opencl", + "use the OpenCL Runtime API")), + cl::init(GPU_RUNTIME_OPENCL), cl::ZeroOrMore, cl::cat(PollyCategory)); #endif - ), - cl::init(TARGET_CPU), cl::ZeroOrMore, cl::cat(PollyCategory)); VectorizerChoice polly::PollyVectorizerChoice; static cl::opt Vectorizer( @@ -309,7 +319,16 @@ if (Target == TARGET_GPU) { #ifdef GPU_CODEGEN - PM.add(polly::createPPCGCodeGenerationPass()); + switch (GPURuntime) { + case GPU_RUNTIME_CUDA: + static const int UseCUDARuntime = 1; + PM.add(polly::createPPCGCodeGenerationPass(UseCUDARuntime)); + break; + case GPU_RUNTIME_OPENCL: + static const int UseOpenCLRuntime = 2; + PM.add(polly::createPPCGCodeGenerationPass(UseOpenCLRuntime)); + break; + } #endif } else { switch (CodeGeneration) { Index: tools/CMakeLists.txt =================================================================== --- tools/CMakeLists.txt +++ tools/CMakeLists.txt @@ -1,5 +1,5 @@ -if (CUDALIB_FOUND) +if (CUDALIB_FOUND AND OpenCL_FOUND) add_subdirectory(GPURuntime) -endif (CUDALIB_FOUND) +endif (CUDALIB_FOUND AND OpenCL_FOUND) set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE) Index: tools/GPURuntime/GPUJIT.h =================================================================== --- tools/GPURuntime/GPUJIT.h +++ tools/GPURuntime/GPUJIT.h @@ -76,11 +76,26 @@ * */ +typedef enum PollyGPURuntimeT { + RUNTIME_NONE, + RUNTIME_CUDA, + RUNTIME_CL +} PollyGPURuntime; + typedef struct PollyGPUContextT PollyGPUContext; typedef struct PollyGPUFunctionT PollyGPUFunction; typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr; -PollyGPUContext *polly_initContext(); +typedef struct OpenCLContextT OpenCLContext; +typedef struct OpenCLKernelT OpenCLKernel; +typedef struct OpenCLDevicePtrT OpenCLDevicePtr; + +typedef struct CUDAContextT CUDAContext; +typedef struct CUDAKernelT CUDAKernel; +typedef struct CUDADevicePtrT CUDADevicePtr; + +PollyGPUContext *polly_initContextCUDA(); +PollyGPUContext *polly_initContextCL(); PollyGPUFunction *polly_getKernel(const char *PTXBuffer, const char *KernelName); void polly_freeKernel(PollyGPUFunction *Kernel); Index: tools/GPURuntime/GPUJIT.c =================================================================== --- tools/GPURuntime/GPUJIT.c +++ tools/GPURuntime/GPUJIT.c @@ -14,6 +14,13 @@ #include "GPUJIT.h" #include #include + +#ifdef __APPLE__ +#include +#else +#include +#endif + #include #include #include @@ -22,6 +29,8 @@ static int DebugMode; static int CacheMode; +static PollyGPURuntime Runtime = RUNTIME_NONE; + static void debug_print(const char *format, ...) { if (!DebugMode) return; @@ -33,18 +42,907 @@ } #define dump_function() debug_print("-> %s\n", __func__) -/* Define Polly's GPGPU data types. */ +#define KERNEL_CACHE_SIZE 10 + +static void err_runtime() { + fprintf(stdout, "Runtime not correctly initialized.\n"); + exit(-1); +} + struct PollyGPUContextT { - CUcontext Cuda; + void *Context; }; struct PollyGPUFunctionT { + void *Kernel; +}; + +struct PollyGPUDevicePtrT { + void *DevicePtr; +}; + +/******************************************************************************/ +/* OpenCL */ +/******************************************************************************/ + +struct OpenCLContextT { + cl_context context; + cl_command_queue command_queue; +}; + +struct OpenCLKernelT { + cl_kernel kernel; + cl_program program; + const char *PTXString; +}; + +struct OpenCLDevicePtrT { + cl_mem mem; +}; + +/* Dynamic library handles for the OpenCL runtime library. */ +static void *HandleOpenCL; + +/* Type-defines of function pointer to OpenCL Runtime API. */ +typedef cl_int clGetPlatformIDsFcnTy(cl_uint num_entries, + cl_platform_id *platforms, + cl_uint *num_platforms); +static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr; + +typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id platform, + cl_device_type device_type, + cl_uint num_entries, cl_device_id *devices, + cl_uint *num_devices); +static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr; + +typedef cl_int clGetDeviceInfoFcnTy(cl_device_id device, + cl_device_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); +static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr; + +typedef cl_int clGetKernelInfoFcnTy(cl_kernel kernel, cl_kernel_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret); +static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr; + +typedef cl_context clCreateContextFcnTy( + const cl_context_properties *properties, cl_uint num_devices, + const cl_device_id *devices, + void CL_CALLBACK *pfn_notify(const char *errinfo, const void *private_info, + size_t cb, void *user_data), + void *user_data, cl_int *errcode_ret); +static clCreateContextFcnTy *clCreateContextFcnPtr; + +typedef cl_command_queue +clCreateCommandQueueFcnTy(cl_context context, cl_device_id device, + cl_command_queue_properties properties, + cl_int *errcode_ret); +static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr; + +typedef cl_mem clCreateBufferFcnTy(cl_context context, cl_mem_flags flags, + size_t size, void *host_ptr, + cl_int *errcode_ret); +static clCreateBufferFcnTy *clCreateBufferFcnPtr; + +typedef cl_int +clEnqueueWriteBufferFcnTy(cl_command_queue command_queue, cl_mem buffer, + cl_bool blocking_write, size_t offset, size_t size, + const void *ptr, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); +static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr; + +typedef cl_program clCreateProgramWithBinaryFcnTy( + cl_context context, cl_uint num_devices, const cl_device_id *device_list, + const size_t *lengths, const unsigned char **binaries, + cl_int *binary_status, cl_int *errcode_ret); +static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr; + +typedef cl_int clBuildProgramFcnTy( + cl_program program, cl_uint num_devices, const cl_device_id *device_list, + const char *options, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data); +static clBuildProgramFcnTy *clBuildProgramFcnPtr; + +typedef cl_kernel clCreateKernelFcnTy(cl_program program, + const char *kernel_name, + cl_int *errcode_ret); +static clCreateKernelFcnTy *clCreateKernelFcnPtr; + +typedef cl_int clSetKernelArgFcnTy(cl_kernel kernel, cl_uint arg_index, + size_t arg_size, const void *arg_value); +static clSetKernelArgFcnTy *clSetKernelArgFcnPtr; + +typedef cl_int clEnqueueNDRangeKernelFcnTy( + cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, + const size_t *global_work_offset, const size_t *global_work_size, + const size_t *local_work_size, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); +static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr; + +typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue command_queue, + cl_mem buffer, cl_bool blocking_read, + size_t offset, size_t size, void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event); +static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr; + +typedef cl_int clFlushFcnTy(cl_command_queue command_queue); +static clFlushFcnTy *clFlushFcnPtr; + +typedef cl_int clFinishFcnTy(cl_command_queue command_queue); +static clFinishFcnTy *clFinishFcnPtr; + +typedef cl_int clReleaseKernelFcnTy(cl_kernel kernel); +static clReleaseKernelFcnTy *clReleaseKernelFcnPtr; + +typedef cl_int clReleaseProgramFcnTy(cl_program program); +static clReleaseProgramFcnTy *clReleaseProgramFcnPtr; + +typedef cl_int clReleaseMemObjectFcnTy(cl_mem memobject); +static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr; + +typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue command_queue); +static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr; + +typedef cl_int clReleaseContextFcnTy(cl_context context); +static clReleaseContextFcnTy *clReleaseContextFcnPtr; + +static void *getAPIHandleCL(void *Handle, const char *FuncName) { + char *Err; + void *FuncPtr; + dlerror(); + FuncPtr = dlsym(Handle, FuncName); + if ((Err = dlerror()) != 0) { + fprintf(stdout, "Load OpenCL Runtime API failed: %s. \n", Err); + return 0; + } + return FuncPtr; +} + +static int initialDeviceAPILibrariesCL() { + HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY); + if (!HandleOpenCL) { + printf("Cannot open library: %s. \n", dlerror()); + return 0; + } + return 1; +} + +static int initialDeviceAPIsCL() { + if (initialDeviceAPILibrariesCL() == 0) + return 0; + + /* Get function pointer to OpenCL Runtime API. + * + * Note that compilers conforming to the ISO C standard are required to + * generate a warning if a conversion from a void * pointer to a function + * pointer is attempted as in the following statements. The warning + * of this kind of cast may not be emitted by clang and new versions of gcc + * as it is valid on POSIX 2008. + */ + clGetPlatformIDsFcnPtr = + (clGetPlatformIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetPlatformIDs"); + + clGetDeviceIDsFcnPtr = + (clGetDeviceIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceIDs"); + + clGetDeviceInfoFcnPtr = + (clGetDeviceInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceInfo"); + + clGetKernelInfoFcnPtr = + (clGetKernelInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetKernelInfo"); + + clCreateContextFcnPtr = + (clCreateContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateContext"); + + clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL( + HandleOpenCL, "clCreateCommandQueue"); + + clCreateBufferFcnPtr = + (clCreateBufferFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateBuffer"); + + clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL( + HandleOpenCL, "clEnqueueWriteBuffer"); + + clCreateProgramWithBinaryFcnPtr = + (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL( + HandleOpenCL, "clCreateProgramWithBinary"); + + clBuildProgramFcnPtr = + (clBuildProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clBuildProgram"); + + clCreateKernelFcnPtr = + (clCreateKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateKernel"); + + clSetKernelArgFcnPtr = + (clSetKernelArgFcnTy *)getAPIHandleCL(HandleOpenCL, "clSetKernelArg"); + + clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL( + HandleOpenCL, "clEnqueueNDRangeKernel"); + + clEnqueueReadBufferFcnPtr = (clEnqueueReadBufferFcnTy *)getAPIHandleCL( + HandleOpenCL, "clEnqueueReadBuffer"); + + clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(HandleOpenCL, "clFlush"); + + clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(HandleOpenCL, "clFinish"); + + clReleaseKernelFcnPtr = + (clReleaseKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseKernel"); + + clReleaseProgramFcnPtr = + (clReleaseProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseProgram"); + + clReleaseMemObjectFcnPtr = (clReleaseMemObjectFcnTy *)getAPIHandleCL( + HandleOpenCL, "clReleaseMemObject"); + + clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL( + HandleOpenCL, "clReleaseCommandQueue"); + + clReleaseContextFcnPtr = + (clReleaseContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseContext"); + + return 1; +} + +/* Context and Device. */ +static PollyGPUContext *GlobalContext = NULL; +static cl_device_id GlobalDeviceID = NULL; + +/* Fd-Decl: Print out OpenCL Error codes to human readable strings. */ +void cl_printError(int error); + +PollyGPUContext *initContextCL() { + dump_function(); + + PollyGPUContext *Context; + + cl_platform_id platform_id = NULL; + cl_device_id device_id = NULL; + cl_uint num_platforms; + cl_uint num_devices; + cl_int ret; + + char DeviceRevision[256]; + char DeviceName[256]; + size_t DeviceRevisionRetSize, DeviceNameRetSize; + + static __thread PollyGPUContext *CurrentContext = NULL; + + if (CurrentContext) + return CurrentContext; + + /* Get API handles. */ + if (initialDeviceAPIsCL() == 0) { + fprintf(stdout, "Getting the \"handle\" for the OpenCL Runtime failed.\n"); + exit(-1); + } + + /* Get number of devices that support OpenCL. */ + ret = clGetPlatformIDsFcnPtr(1, &platform_id, &num_platforms); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to get platform IDs.\n"); + cl_printError(ret); + exit(-1); + } + // TODO: Extend to CL_DEVICE_TYPE_ALL? + ret = clGetDeviceIDsFcnPtr(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, + &num_devices); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to get device IDs.\n"); + cl_printError(ret); + exit(-1); + } + + GlobalDeviceID = device_id; + if (num_devices == 0) { + fprintf(stdout, "There is no device supporting OpenCL.\n"); + exit(-1); + } + + /* Get device revision. */ + ret = clGetDeviceInfoFcnPtr(device_id, CL_DEVICE_VERSION, 256, DeviceRevision, + &DeviceRevisionRetSize); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to fetch device revision.\n"); + cl_printError(ret); + exit(-1); + } + + /* Get device name. */ + ret = clGetDeviceInfoFcnPtr(device_id, CL_DEVICE_NAME, 256, DeviceName, + &DeviceNameRetSize); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to fetch device name.\n"); + cl_printError(ret); + exit(-1); + } + + debug_print("> Running on GPU device %d : %s.\n", device_id, DeviceName); + + /* Create context on the device. */ + Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext)); + if (Context == 0) { + fprintf(stdout, "Allocate memory for Polly GPU context failed.\n"); + exit(-1); + } + Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext)); + if (Context->Context == 0) { + fprintf(stdout, "Allocate memory for Polly OpenCL context failed.\n"); + exit(-1); + } + ((OpenCLContext *)Context->Context)->context = + clCreateContextFcnPtr(NULL, 1, &device_id, NULL, NULL, &ret); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to create context.\n"); + cl_printError(ret); + exit(-1); + } + + ((OpenCLContext *)Context->Context)->command_queue = + clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->context, + device_id, 0, &ret); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to create command queue.\n"); + cl_printError(ret); + exit(-1); + } + + if (CacheMode) + CurrentContext = Context; + + GlobalContext = Context; + return Context; +} + +static void freeKernelCL(PollyGPUFunction *Kernel) { + dump_function(); + + if (CacheMode) + return; + + if (!GlobalContext) { + fprintf(stdout, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + cl_int ret; + ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->command_queue); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to flush command queue.\n"); + cl_printError(ret); + exit(-1); + } + ret = + clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->command_queue); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to flush command queue.\n"); + cl_printError(ret); + exit(-1); + } + + if (((OpenCLKernel *)Kernel->Kernel)->kernel) { + cl_int ret = + clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->kernel); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to release kernel.\n"); + cl_printError(ret); + exit(-1); + } + } + + if (((OpenCLKernel *)Kernel->Kernel)->program) { + cl_int ret = + clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->program); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to release program.\n"); + cl_printError(ret); + exit(-1); + } + } + + if (Kernel->Kernel) + free((OpenCLKernel *)Kernel->Kernel); + + if (Kernel) + free(Kernel); +} + +PollyGPUFunction *getKernelCL(const char *PTXBuffer, const char *KernelName) { + dump_function(); + + if (!GlobalContext) { + fprintf(stdout, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; + static __thread int NextCacheItem = 0; + + for (long i = 0; i < KERNEL_CACHE_SIZE; i++) { + // We exploit here the property that all Polly-ACC kernels are allocated + // as global constants, hence a pointer comparision is sufficient to + // determin equality. + if (KernelCache[i] && + ((OpenCLKernel *)KernelCache[i]->Kernel)->PTXString == PTXBuffer) { + debug_print(" -> using cached kernel\n"); + return KernelCache[i]; + } + } + + PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); + if (Function == 0) { + fprintf(stdout, "Allocate memory for Polly GPU function failed.\n"); + exit(-1); + } + Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel)); + if (Function->Kernel == 0) { + fprintf(stdout, "Allocate memory for Polly OpenCL kernel failed.\n"); + exit(-1); + } + + if (!GlobalDeviceID) { + fprintf(stdout, "GPGPU-code generation not initialized correctly.\n"); + exit(-1); + } + + cl_int bin_status; + cl_int ret; + size_t binary_size = strlen(PTXBuffer); + ((OpenCLKernel *)Function->Kernel)->program = clCreateProgramWithBinaryFcnPtr( + ((OpenCLContext *)GlobalContext->Context)->context, 1, &GlobalDeviceID, + (const size_t *)&binary_size, (const unsigned char **)&PTXBuffer, + &bin_status, &ret); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to create program from binary.\n"); + cl_printError(ret); + exit(-1); + } + + ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->program, 1, + &GlobalDeviceID, NULL, NULL, NULL); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to build program.\n"); + cl_printError(ret); + exit(-1); + } + + ((OpenCLKernel *)Function->Kernel)->kernel = clCreateKernelFcnPtr( + ((OpenCLKernel *)Function->Kernel)->program, KernelName, &ret); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to create kernel.\n"); + cl_printError(ret); + exit(-1); + } + + ((OpenCLKernel *)Function->Kernel)->PTXString = PTXBuffer; + + if (CacheMode) { + if (KernelCache[NextCacheItem]) + freeKernelCL(KernelCache[NextCacheItem]); + + KernelCache[NextCacheItem] = Function; + + NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE; + } + + return Function; +} + +void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData, + long MemSize) { + dump_function(); + + if (!GlobalContext) { + fprintf(stdout, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + cl_int ret; + ret = clEnqueueWriteBufferFcnPtr( + ((OpenCLContext *)GlobalContext->Context)->command_queue, + ((OpenCLDevicePtr *)DevData->DevicePtr)->mem, CL_TRUE, 0, MemSize, + HostData, 0, NULL, NULL); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Copying data from host memory to device failed.\n"); + cl_printError(ret); + exit(-1); + } +} + +void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData, + long MemSize) { + dump_function(); + + if (!GlobalContext) { + fprintf(stdout, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + cl_int ret; + ret = clEnqueueReadBufferFcnPtr( + ((OpenCLContext *)GlobalContext->Context)->command_queue, + ((OpenCLDevicePtr *)DevData->DevicePtr)->mem, CL_TRUE, 0, MemSize, + HostData, 0, NULL, NULL); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Copying results from device to host memory failed.\n"); + cl_printError(ret); + exit(-1); + } +} + +void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX, + unsigned int GridDimY, unsigned int BlockDimX, + unsigned int BlockDimY, unsigned int BlockDimZ, + void **Parameters) { + dump_function(); + + cl_int ret; + cl_uint num_args; + + if (!GlobalContext) { + fprintf(stdout, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel; + ret = clGetKernelInfoFcnPtr(CLKernel->kernel, CL_KERNEL_NUM_ARGS, + sizeof(cl_uint), &num_args, NULL); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to get number of kernel arguments.\n"); + cl_printError(ret); + exit(-1); + } + + for (cl_uint i = 0; i < num_args; i++) { + debug_print("Setting arg %d.\n", i); + ret = clSetKernelArgFcnPtr(CLKernel->kernel, i, 8, (void *)Parameters[i]); + if (ret == CL_INVALID_ARG_SIZE) { + debug_print("Arg %d with 8 bytes failed, trying 4.\n", i); + ret = clSetKernelArgFcnPtr(CLKernel->kernel, i, 4, (void *)Parameters[i]); + if (ret == CL_INVALID_ARG_SIZE) { + debug_print("Arg %d with 4 bytes failed, trying 2.\n", i); + ret = + clSetKernelArgFcnPtr(CLKernel->kernel, i, 2, (void *)Parameters[i]); + if (ret == CL_INVALID_ARG_SIZE) { + debug_print("Arg %d with 2 bytes failed, trying 1.\n", i); + ret = clSetKernelArgFcnPtr(CLKernel->kernel, i, 1, + (void *)Parameters[i]); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to set Kernel argument %d.\n", i); + cl_printError(ret); + exit(-1); + } + } + } + } + if (ret != CL_SUCCESS && ret != CL_INVALID_ARG_SIZE) { + fprintf(stdout, "Failed to set Kernel argument.\n"); + cl_printError(ret); + exit(-1); + } + } + + unsigned int GridDimZ = 1; + size_t global_work_size[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY, + BlockDimZ * GridDimZ}; + size_t local_work_size[3] = {BlockDimX, BlockDimY, BlockDimZ}; + + OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context; + ret = clEnqueueNDRangeKernelFcnPtr(CLContext->command_queue, CLKernel->kernel, + 3, NULL, global_work_size, local_work_size, + 0, NULL, NULL); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Launching OpenCL kernel failed.\n"); + cl_printError(ret); + exit(-1); + } +} + +void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) { + dump_function(); + + OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; + cl_int ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->mem); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Failed to free device memory.\n"); + cl_printError(ret); + exit(-1); + } + + free(DevPtr); + free(Allocation); +} + +PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) { + dump_function(); + + if (!GlobalContext) { + fprintf(stdout, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); + if (DevData == 0) { + fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); + exit(-1); + } + DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr)); + if (DevData->DevicePtr == 0) { + fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); + exit(-1); + } + + cl_int ret; + ((OpenCLDevicePtr *)DevData->DevicePtr)->mem = + clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->context, + CL_MEM_READ_WRITE, MemSize, NULL, &ret); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); + cl_printError(ret); + exit(-1); + } + + return DevData; +} + +void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) { + dump_function(); + + OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; + return (void *)DevPtr->mem; +} + +void freeContextCL(PollyGPUContext *Context) { + dump_function(); + + cl_int ret; + + GlobalContext = NULL; + + OpenCLContext *Ctx = (OpenCLContext *)Context->Context; + if (Ctx->command_queue) { + ret = clReleaseCommandQueueFcnPtr(Ctx->command_queue); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Could not release command queue.\n"); + cl_printError(ret); + exit(-1); + } + } + + if (Ctx->context) { + ret = clReleaseContextFcnPtr(Ctx->context); + if (ret != CL_SUCCESS) { + fprintf(stdout, "Could not release context.\n"); + cl_printError(ret); + exit(-1); + } + } + + free(Ctx); + free(Context); +} + +void cl_printError(int error) { + + switch (error) { + case CL_SUCCESS: + // Success, don't print an error. + break; + + // JIT/Runtime errors. + case CL_DEVICE_NOT_FOUND: + fprintf(stdout, "Device not found.\n"); + break; + case CL_DEVICE_NOT_AVAILABLE: + fprintf(stdout, "Device not available.\n"); + break; + case CL_COMPILER_NOT_AVAILABLE: + fprintf(stdout, "Compiler not available.\n"); + break; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + fprintf(stdout, "Mem object allocation failure.\n"); + break; + case CL_OUT_OF_RESOURCES: + fprintf(stdout, "Out of resources.\n"); + break; + case CL_OUT_OF_HOST_MEMORY: + fprintf(stdout, "Out of host memory.\n"); + break; + case CL_PROFILING_INFO_NOT_AVAILABLE: + fprintf(stdout, "Profiling info not available.\n"); + break; + case CL_MEM_COPY_OVERLAP: + fprintf(stdout, "Mem copy overlap.\n"); + break; + case CL_IMAGE_FORMAT_MISMATCH: + fprintf(stdout, "Image format mismatch.\n"); + break; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: + fprintf(stdout, "Image format not supported.\n"); + break; + case CL_BUILD_PROGRAM_FAILURE: + fprintf(stdout, "Build program failure.\n"); + break; + case CL_MAP_FAILURE: + fprintf(stdout, "Map failure.\n"); + break; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: + fprintf(stdout, "Misaligned sub buffer offset.\n"); + break; + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: + fprintf(stdout, "Exec status error for events in wait list.\n"); + break; + case CL_COMPILE_PROGRAM_FAILURE: + fprintf(stdout, "Compile program failure.\n"); + break; + case CL_LINKER_NOT_AVAILABLE: + fprintf(stdout, "Linker not available.\n"); + break; + case CL_LINK_PROGRAM_FAILURE: + fprintf(stdout, "Link program failure.\n"); + break; + case CL_DEVICE_PARTITION_FAILED: + fprintf(stdout, "Device partition failed.\n"); + break; + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: + fprintf(stdout, "Kernel arg info not available.\n"); + break; + + // Compiler errors. + case CL_INVALID_VALUE: + fprintf(stdout, "Invalid value.\n"); + break; + case CL_INVALID_DEVICE_TYPE: + fprintf(stdout, "Invalid device type.\n"); + break; + case CL_INVALID_PLATFORM: + fprintf(stdout, "Invalid platform.\n"); + break; + case CL_INVALID_DEVICE: + fprintf(stdout, "Invalid device.\n"); + break; + case CL_INVALID_CONTEXT: + fprintf(stdout, "Invalid context.\n"); + break; + case CL_INVALID_QUEUE_PROPERTIES: + fprintf(stdout, "Invalid queue properties.\n"); + break; + case CL_INVALID_COMMAND_QUEUE: + fprintf(stdout, "Invalid command queue.\n"); + break; + case CL_INVALID_HOST_PTR: + fprintf(stdout, "Invalid host pointer.\n"); + break; + case CL_INVALID_MEM_OBJECT: + fprintf(stdout, "Invalid memory object.\n"); + break; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: + fprintf(stdout, "Invalid image format descriptor.\n"); + break; + case CL_INVALID_IMAGE_SIZE: + fprintf(stdout, "Invalid image size.\n"); + break; + case CL_INVALID_SAMPLER: + fprintf(stdout, "Invalid sampler.\n"); + break; + case CL_INVALID_BINARY: + fprintf(stdout, "Invalid binary.\n"); + break; + case CL_INVALID_BUILD_OPTIONS: + fprintf(stdout, "Invalid build options.\n"); + break; + case CL_INVALID_PROGRAM: + fprintf(stdout, "Invalid program.\n"); + break; + case CL_INVALID_PROGRAM_EXECUTABLE: + fprintf(stdout, "Invalid program executable.\n"); + break; + case CL_INVALID_KERNEL_NAME: + fprintf(stdout, "Invalid kernel name.\n"); + break; + case CL_INVALID_KERNEL_DEFINITION: + fprintf(stdout, "Invalid kernel definition.\n"); + break; + case CL_INVALID_KERNEL: + fprintf(stdout, "Invalid kernel.\n"); + break; + case CL_INVALID_ARG_INDEX: + fprintf(stdout, "Invalid arg index.\n"); + break; + case CL_INVALID_ARG_VALUE: + fprintf(stdout, "Invalid arg value.\n"); + break; + case CL_INVALID_ARG_SIZE: + fprintf(stdout, "Invalid arg size.\n"); + break; + case CL_INVALID_KERNEL_ARGS: + fprintf(stdout, "Invalid kernel args.\n"); + break; + case CL_INVALID_WORK_DIMENSION: + fprintf(stdout, "Invalid work dimension.\n"); + break; + case CL_INVALID_WORK_GROUP_SIZE: + fprintf(stdout, "Invalid work group size.\n"); + break; + case CL_INVALID_WORK_ITEM_SIZE: + fprintf(stdout, "Invalid work item size.\n"); + break; + case CL_INVALID_GLOBAL_OFFSET: + fprintf(stdout, "Invalid global offset.\n"); + break; + case CL_INVALID_EVENT_WAIT_LIST: + fprintf(stdout, "Invalid event wait list.\n"); + break; + case CL_INVALID_EVENT: + fprintf(stdout, "Invalid event.\n"); + break; + case CL_INVALID_OPERATION: + fprintf(stdout, "Invalid operation.\n"); + break; + case CL_INVALID_GL_OBJECT: + fprintf(stdout, "Invalid GL object.\n"); + break; + case CL_INVALID_BUFFER_SIZE: + fprintf(stdout, "Invalid buffer size.\n"); + break; + case CL_INVALID_MIP_LEVEL: + fprintf(stdout, "Invalid mip level.\n"); + break; + case CL_INVALID_GLOBAL_WORK_SIZE: + fprintf(stdout, "Invalid global work size.\n"); + break; + case CL_INVALID_PROPERTY: + fprintf(stdout, "Invalid property.\n"); + break; + case CL_INVALID_IMAGE_DESCRIPTOR: + fprintf(stdout, "Invalid image descriptor.\n"); + break; + case CL_INVALID_COMPILER_OPTIONS: + fprintf(stdout, "Invalid compiler options.\n"); + break; + case CL_INVALID_LINKER_OPTIONS: + fprintf(stdout, "Invalid linker options.\n"); + break; + case CL_INVALID_DEVICE_PARTITION_COUNT: + fprintf(stdout, "Invalid device partition count.\n"); + break; + case CL_INVALID_PIPE_SIZE: + fprintf(stdout, "Invalid pipe size.\n"); + break; + case CL_INVALID_DEVICE_QUEUE: + fprintf(stdout, "Invalid device queue.\n"); + break; + + // NVIDIA specific error. + case -9999: + fprintf(stdout, "NVIDIA invalid read or write buffer.\n"); + break; + + default: + fprintf(stdout, "Unknown error code!\n"); + break; + } +} + +/******************************************************************************/ +/* CUDA */ +/******************************************************************************/ + +struct CUDAContextT { + CUcontext Cuda; +}; + +struct CUDAKernelT { CUfunction Cuda; CUmodule CudaModule; const char *PTXString; }; -struct PollyGPUDevicePtrT { +struct CUDADevicePtrT { CUdeviceptr Cuda; }; @@ -134,7 +1032,7 @@ typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void); static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr; -static void *getAPIHandle(void *Handle, const char *FuncName) { +static void *getAPIHandleCUDA(void *Handle, const char *FuncName) { char *Err; void *FuncPtr; dlerror(); @@ -146,7 +1044,7 @@ return FuncPtr; } -static int initialDeviceAPILibraries() { +static int initialDeviceAPILibrariesCUDA() { HandleCuda = dlopen("libcuda.so", RTLD_LAZY); if (!HandleCuda) { printf("Cannot open library: %s. \n", dlerror()); @@ -162,8 +1060,8 @@ return 1; } -static int initialDeviceAPIs() { - if (initialDeviceAPILibraries() == 0) +static int initialDeviceAPIsCUDA() { + if (initialDeviceAPILibrariesCUDA() == 0) return 0; /* Get function pointer to CUDA Driver APIs. @@ -175,75 +1073,75 @@ * as it is valid on POSIX 2008. */ CuLaunchKernelFcnPtr = - (CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel"); + (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel"); CuMemAllocFcnPtr = - (CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2"); + (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2"); - CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandle(HandleCuda, "cuMemFree_v2"); + CuMemFreeFcnPtr = + (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2"); CuMemcpyDtoHFcnPtr = - (CuMemcpyDtoHFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyDtoH_v2"); + (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2"); CuMemcpyHtoDFcnPtr = - (CuMemcpyHtoDFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyHtoD_v2"); + (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2"); CuModuleUnloadFcnPtr = - (CuModuleUnloadFcnTy *)getAPIHandle(HandleCuda, "cuModuleUnload"); + (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload"); CuCtxDestroyFcnPtr = - (CuCtxDestroyFcnTy *)getAPIHandle(HandleCuda, "cuCtxDestroy"); + (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy"); - CuInitFcnPtr = (CuInitFcnTy *)getAPIHandle(HandleCuda, "cuInit"); + CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit"); CuDeviceGetCountFcnPtr = - (CuDeviceGetCountFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetCount"); + (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount"); CuDeviceGetFcnPtr = - (CuDeviceGetFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGet"); + (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet"); CuCtxCreateFcnPtr = - (CuCtxCreateFcnTy *)getAPIHandle(HandleCuda, "cuCtxCreate_v2"); + (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2"); - CuModuleLoadDataExFcnPtr = - (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx"); + CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA( + HandleCuda, "cuModuleLoadDataEx"); CuModuleLoadDataFcnPtr = - (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData"); + (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData"); - CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle( + CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA( HandleCuda, "cuModuleGetFunction"); CuDeviceComputeCapabilityFcnPtr = - (CuDeviceComputeCapabilityFcnTy *)getAPIHandle( + (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA( HandleCuda, "cuDeviceComputeCapability"); CuDeviceGetNameFcnPtr = - (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName"); + (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName"); CuLinkAddDataFcnPtr = - (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData"); + (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData"); CuLinkCreateFcnPtr = - (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate"); + (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate"); CuLinkCompleteFcnPtr = - (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete"); + (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete"); CuLinkDestroyFcnPtr = - (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy"); + (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy"); /* Get function pointer to CUDA Runtime APIs. */ - CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle( + CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA( HandleCudaRT, "cudaThreadSynchronize"); return 1; } -PollyGPUContext *polly_initContext() { - DebugMode = getenv("POLLY_DEBUG") != 0; - +PollyGPUContext *initContextCUDA() { dump_function(); + PollyGPUContext *Context; CUdevice Device; @@ -257,7 +1155,7 @@ return CurrentContext; /* Get API handles. */ - if (initialDeviceAPIs() == 0) { + if (initialDeviceAPIsCUDA() == 0) { fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n"); exit(-1); } @@ -287,9 +1185,12 @@ fprintf(stdout, "Allocate memory for Polly GPU context failed.\n"); exit(-1); } - CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device); - - CacheMode = getenv("POLLY_NOCACHE") == 0; + Context->Context = malloc(sizeof(CUDAContext)); + if (Context->Context == 0) { + fprintf(stdout, "Allocate memory for Polly CUDA context failed.\n"); + exit(-1); + } + CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0, Device); if (CacheMode) CurrentContext = Context; @@ -297,18 +1198,23 @@ return Context; } -static void freeKernel(PollyGPUFunction *Kernel) { - if (Kernel->CudaModule) - CuModuleUnloadFcnPtr(Kernel->CudaModule); +static void freeKernelCUDA(PollyGPUFunction *Kernel) { + dump_function(); + + if (CacheMode) + return; + + if (((CUDAKernel *)Kernel->Kernel)->CudaModule) + CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule); + + if (Kernel->Kernel) + free((CUDAKernel *)Kernel->Kernel); if (Kernel) free(Kernel); } -#define KERNEL_CACHE_SIZE 10 - -PollyGPUFunction *polly_getKernel(const char *PTXBuffer, - const char *KernelName) { +PollyGPUFunction *getKernelCUDA(const char *PTXBuffer, const char *KernelName) { dump_function(); static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; @@ -318,18 +1224,23 @@ // We exploit here the property that all Polly-ACC kernels are allocated // as global constants, hence a pointer comparision is sufficient to // determin equality. - if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) { + if (KernelCache[i] && + ((CUDAKernel *)KernelCache[i]->Kernel)->PTXString == PTXBuffer) { debug_print(" -> using cached kernel\n"); return KernelCache[i]; } } PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); - if (Function == 0) { fprintf(stdout, "Allocate memory for Polly GPU function failed.\n"); exit(-1); } + Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel)); + if (Function->Kernel == 0) { + fprintf(stdout, "Allocate memory for Polly CUDA function failed.\n"); + exit(-1); + } CUresult Res; CUlinkState LState; @@ -381,13 +1292,15 @@ debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime, InfoLog); - Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut); + Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule), + CuOut); if (Res != CUDA_SUCCESS) { fprintf(stdout, "Loading ptx assembly text failed.\n"); exit(-1); } - Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule, + Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda), + ((CUDAKernel *)Function->Kernel)->CudaModule, KernelName); if (Res != CUDA_SUCCESS) { fprintf(stdout, "Loading kernel function failed.\n"); @@ -396,11 +1309,11 @@ CuLinkDestroyFcnPtr(LState); - Function->PTXString = PTXBuffer; + ((CUDAKernel *)Function->Kernel)->PTXString = PTXBuffer; if (CacheMode) { if (KernelCache[NextCacheItem]) - freeKernel(KernelCache[NextCacheItem]); + freeKernelCUDA(KernelCache[NextCacheItem]); KernelCache[NextCacheItem] = Function; @@ -410,37 +1323,29 @@ return Function; } -void polly_freeKernel(PollyGPUFunction *Kernel) { - dump_function(); - - if (CacheMode) - return; - - freeKernel(Kernel); -} - -void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, - long MemSize) { +void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData, + long MemSize) { dump_function(); - CUdeviceptr CuDevData = DevData->Cuda; + CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda; CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize); } -void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, - long MemSize) { +void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData, + long MemSize) { dump_function(); - if (CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) { + if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda, + MemSize) != CUDA_SUCCESS) { fprintf(stdout, "Copying results from device to host memory failed.\n"); exit(-1); } } -void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, - unsigned int GridDimY, unsigned int BlockDimX, - unsigned int BlockDimY, unsigned int BlockDimZ, - void **Parameters) { +void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX, + unsigned int GridDimY, unsigned int BlockDimX, + unsigned int BlockDimY, unsigned int BlockDimZ, + void **Parameters) { dump_function(); unsigned GridDimZ = 1; @@ -449,32 +1354,40 @@ void **Extra = 0; CUresult Res; - Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ, - BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes, - Stream, Parameters, Extra); + Res = + CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX, + GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ, + SharedMemBytes, Stream, Parameters, Extra); if (Res != CUDA_SUCCESS) { fprintf(stdout, "Launching CUDA kernel failed.\n"); exit(-1); } } -void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) { +void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) { dump_function(); - CuMemFreeFcnPtr((CUdeviceptr)Allocation->Cuda); + CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr; + CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda); + free(DevPtr); free(Allocation); } -PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) { +PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) { dump_function(); PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); - if (DevData == 0) { fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); exit(-1); } + DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr)); + if (DevData->DevicePtr == 0) { + fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); + exit(-1); + } - CUresult Res = CuMemAllocFcnPtr(&(DevData->Cuda), MemSize); + CUresult Res = + CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize); if (Res != CUDA_SUCCESS) { fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); @@ -484,10 +1397,191 @@ return DevData; } +void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) { + dump_function(); + + CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr; + return (void *)DevPtr->Cuda; +} + +void freeContextCUDA(PollyGPUContext *Context) { + dump_function(); + + CUDAContext *Ctx = (CUDAContext *)Context->Context; + if (Ctx->Cuda) { + CuCtxDestroyFcnPtr(Ctx->Cuda); + free(Ctx); + free(Context); + } + + dlclose(HandleCuda); + dlclose(HandleCudaRT); +} + +/******************************************************************************/ +/* API */ +/******************************************************************************/ + +PollyGPUContext *polly_initContext() { + DebugMode = getenv("POLLY_DEBUG") != 0; + CacheMode = getenv("POLLY_NOCACHE") == 0; + + dump_function(); + + PollyGPUContext *Context; + + switch (Runtime) { + case RUNTIME_CUDA: + Context = initContextCUDA(); + break; + case RUNTIME_CL: + Context = initContextCL(); + break; + default: + err_runtime(); + } + + return Context; +} + +void polly_freeKernel(PollyGPUFunction *Kernel) { + dump_function(); + + switch (Runtime) { + case RUNTIME_CUDA: + freeKernelCUDA(Kernel); + break; + case RUNTIME_CL: + freeKernelCL(Kernel); + break; + default: + err_runtime(); + } +} + +PollyGPUFunction *polly_getKernel(const char *PTXBuffer, + const char *KernelName) { + dump_function(); + + PollyGPUFunction *Function; + + switch (Runtime) { + case RUNTIME_CUDA: + Function = getKernelCUDA(PTXBuffer, KernelName); + break; + case RUNTIME_CL: + Function = getKernelCL(PTXBuffer, KernelName); + break; + default: + err_runtime(); + } + + return Function; +} + +void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, + long MemSize) { + dump_function(); + + switch (Runtime) { + case RUNTIME_CUDA: + copyFromHostToDeviceCUDA(HostData, DevData, MemSize); + break; + case RUNTIME_CL: + copyFromHostToDeviceCL(HostData, DevData, MemSize); + break; + default: + err_runtime(); + } +} + +void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, + long MemSize) { + dump_function(); + + switch (Runtime) { + case RUNTIME_CUDA: + copyFromDeviceToHostCUDA(DevData, HostData, MemSize); + break; + case RUNTIME_CL: + copyFromDeviceToHostCL(DevData, HostData, MemSize); + break; + default: + err_runtime(); + } +} + +void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, + unsigned int GridDimY, unsigned int BlockDimX, + unsigned int BlockDimY, unsigned int BlockDimZ, + void **Parameters) { + dump_function(); + + switch (Runtime) { + case RUNTIME_CUDA: + launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, + BlockDimZ, Parameters); + break; + case RUNTIME_CL: + launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ, + Parameters); + break; + default: + err_runtime(); + } +} + +void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) { + dump_function(); + + switch (Runtime) { + case RUNTIME_CUDA: + freeDeviceMemoryCUDA(Allocation); + break; + case RUNTIME_CL: + freeDeviceMemoryCL(Allocation); + break; + default: + err_runtime(); + } +} + +PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) { + dump_function(); + + PollyGPUDevicePtr *DevData; + + switch (Runtime) { + case RUNTIME_CUDA: + DevData = allocateMemoryForDeviceCUDA(MemSize); + break; + case RUNTIME_CL: + DevData = allocateMemoryForDeviceCL(MemSize); + break; + default: + err_runtime(); + } + + return DevData; +} + void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) { dump_function(); - return (void *)Allocation->Cuda; + void *DevPtr; + + switch (Runtime) { + case RUNTIME_CUDA: + DevPtr = getDevicePtrCUDA(Allocation); + break; + case RUNTIME_CL: + DevPtr = getDevicePtrCL(Allocation); + break; + default: + err_runtime(); + } + + return DevPtr; } void polly_freeContext(PollyGPUContext *Context) { @@ -496,11 +1590,26 @@ if (CacheMode) return; - if (Context->Cuda) { - CuCtxDestroyFcnPtr(Context->Cuda); - free(Context); + switch (Runtime) { + case RUNTIME_CUDA: + freeContextCUDA(Context); + break; + case RUNTIME_CL: + freeContextCL(Context); + break; + default: + err_runtime(); } +} - dlclose(HandleCuda); - dlclose(HandleCudaRT); +/* Initialize GPUJIT with CUDA as runtime library. */ +PollyGPUContext *polly_initContextCUDA() { + Runtime = RUNTIME_CUDA; + return polly_initContext(); +} + +/* Initialize GPUJIT with OpenCL as runtime library. */ +PollyGPUContext *polly_initContextCL() { + Runtime = RUNTIME_CL; + return polly_initContext(); }