Index: CMakeLists.txt =================================================================== --- CMakeLists.txt +++ CMakeLists.txt @@ -152,9 +152,10 @@ option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF) if (POLLY_ENABLE_GPGPU_CODEGEN) - # Do not require CUDA, as GPU code generation test cases can be run without - # a cuda library. + # Do not require CUDA/OpenCL, as GPU code generation test cases can be run + # without a CUDA/OpenCL library. FIND_PACKAGE(CUDA) + FIND_PACKAGE(OpenCL) set(GPU_CODEGEN TRUE) else(POLLY_ENABLE_GPGPU_CODEGEN) set(GPU_CODEGEN FALSE) @@ -162,9 +163,10 @@ # Support GPGPU code generation if the library is available. -if (CUDALIB_FOUND) +if (CUDALIB_FOUND AND OpenCL_FOUND) INCLUDE_DIRECTORIES( ${CUDALIB_INCLUDE_DIR} ) -endif(CUDALIB_FOUND) + INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} ) +endif(CUDALIB_FOUND AND OpenCL_FOUND) option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON) if (NOT POLLY_BUNDLED_ISL) Index: include/polly/LinkAllPasses.h =================================================================== --- include/polly/LinkAllPasses.h +++ include/polly/LinkAllPasses.h @@ -48,7 +48,7 @@ llvm::Pass *createIslAstInfoPass(); llvm::Pass *createCodeGenerationPass(); #ifdef GPU_CODEGEN -llvm::Pass *createPPCGCodeGenerationPass(); +llvm::Pass *createPPCGCodeGenerationPass(int Runtime = 0); #endif llvm::Pass *createIslScheduleOptimizerPass(); llvm::Pass *createFlattenSchedulePass(); Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -53,6 +53,9 @@ #define DEBUG_TYPE "polly-codegen-ppcg" +/// The GPU Runtime implementation to use. +enum GPURuntime { GPURuntime_CUDA, GPURuntime_OpenCL }; + static cl::opt DumpSchedule("polly-acc-dump-schedule", cl::desc("Dump the computed GPU Schedule"), cl::Hidden, cl::init(false), cl::ZeroOrMore, @@ -146,9 +149,9 @@ GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, Scop &S, BasicBlock *StartBlock, - gpu_prog *Prog) + gpu_prog *Prog, GPURuntime Runtime) : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock), - Prog(Prog) { + Prog(Prog), Runtime(Runtime) { getExprBuilder().setIDToSAI(&IDToSAI); } @@ -194,6 +197,9 @@ /// The GPU program we generate code for. gpu_prog *Prog; + /// The GPU Runtime implementation to use (OpenCL or CUDA). + GPURuntime Runtime; + /// Class to free isl_ids. class IslIdDeleter { public: @@ -700,7 +706,13 @@ } Value *GPUNodeBuilder::createCallInitContext() { - const char *Name = "polly_initContext"; + const char *Name; + + if (Runtime == GPURuntime_CUDA) + Name = "polly_initContextCUDA"; + else if (Runtime == GPURuntime_OpenCL) + Name = "polly_initContextCL"; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Function *F = M->getFunction(Name); @@ -1543,10 +1555,14 @@ void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, SetVector &SubtreeValues) { - std::string Identifier = "kernel_" + std::to_string(Kernel->id); GPUModule.reset(new Module(Identifier, Builder.getContext())); - GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); + + if (Runtime == GPURuntime_CUDA) + GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); + else if (Runtime == GPURuntime_OpenCL) + GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl")); + GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); @@ -1568,7 +1584,13 @@ } std::string GPUNodeBuilder::createKernelASM() { - llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda")); + llvm::Triple GPUTriple; + + if (Runtime == GPURuntime_CUDA) + GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda")); + else if (Runtime == GPURuntime_OpenCL) + GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl")); + std::string ErrMsg; auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); @@ -1633,6 +1655,8 @@ public: static char ID; + GPURuntime Runtime = GPURuntime_CUDA; + /// The scop that is currently processed. Scop *S; @@ -2416,7 +2440,7 @@ executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI); GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S, - StartBlock, Prog); + StartBlock, Prog, Runtime); // TODO: Handle LICM auto SplitBlock = StartBlock->getSinglePredecessor(); @@ -2504,7 +2528,18 @@ char PPCGCodeGeneration::ID = 1; -Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); } +Pass *polly::createPPCGCodeGenerationPass(int Runtime) { + PPCGCodeGeneration *generator = new PPCGCodeGeneration(); + switch (Runtime) { + case 1: + generator->Runtime = GPURuntime_CUDA; + break; + case 2: + generator->Runtime = GPURuntime_OpenCL; + break; + } + return generator; +} INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg", "Polly - Apply PPCG translation to SCOP", false, false) Index: lib/Support/RegisterPasses.cpp =================================================================== --- lib/Support/RegisterPasses.cpp +++ lib/Support/RegisterPasses.cpp @@ -101,6 +101,16 @@ ), cl::init(TARGET_CPU), cl::ZeroOrMore, cl::cat(PollyCategory)); +#ifdef GPU_CODEGEN +enum GPURuntimeChoice { GPU_RUNTIME_CUDA, GPU_RUNTIME_OPENCL }; +static cl::opt GPURuntime( + "polly-gpu-runtime", cl::desc("The GPU Runtime API to target"), + cl::values(clEnumValN(GPU_RUNTIME_CUDA, "cuda", "use the CUDA Runtime API"), + clEnumValN(GPU_RUNTIME_OPENCL, "opencl", + "use the OpenCL Runtime API")), + cl::init(GPU_RUNTIME_CUDA), cl::ZeroOrMore, cl::cat(PollyCategory)); +#endif + VectorizerChoice polly::PollyVectorizerChoice; static cl::opt Vectorizer( "polly-vectorizer", cl::desc("Select the vectorization strategy"), @@ -309,7 +319,16 @@ if (Target == TARGET_GPU) { #ifdef GPU_CODEGEN - PM.add(polly::createPPCGCodeGenerationPass()); + switch (GPURuntime) { + case GPU_RUNTIME_CUDA: + static const int UseCUDARuntime = 1; + PM.add(polly::createPPCGCodeGenerationPass(UseCUDARuntime)); + break; + case GPU_RUNTIME_OPENCL: + static const int UseOpenCLRuntime = 2; + PM.add(polly::createPPCGCodeGenerationPass(UseOpenCLRuntime)); + break; + } #endif } else { switch (CodeGeneration) { Index: test/GPGPU/size-cast.ll =================================================================== --- test/GPGPU/size-cast.ll +++ test/GPGPU/size-cast.ll @@ -29,7 +29,7 @@ ; CODE-NEXT: if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1) ; CODE-NEXT: Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0); -; IR: call i8* @polly_initContext() +; IR: call i8* @polly_initContextCUDA() ; IR-NEXT: sext i32 %arg to i64 ; IR-NEXT: mul i64 ; IR-NEXT: @polly_allocateMemoryForDevice Index: tools/CMakeLists.txt =================================================================== --- tools/CMakeLists.txt +++ tools/CMakeLists.txt @@ -1,5 +1,5 @@ -if (CUDALIB_FOUND) +if (CUDALIB_FOUND AND OpenCL_FOUND) add_subdirectory(GPURuntime) -endif (CUDALIB_FOUND) +endif (CUDALIB_FOUND AND OpenCL_FOUND) set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE) Index: tools/GPURuntime/GPUJIT.h =================================================================== --- tools/GPURuntime/GPUJIT.h +++ tools/GPURuntime/GPUJIT.h @@ -76,12 +76,27 @@ * */ +typedef enum PollyGPURuntimeT { + RUNTIME_NONE, + RUNTIME_CUDA, + RUNTIME_CL +} PollyGPURuntime; + typedef struct PollyGPUContextT PollyGPUContext; typedef struct PollyGPUFunctionT PollyGPUFunction; typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr; -PollyGPUContext *polly_initContext(); -PollyGPUFunction *polly_getKernel(const char *PTXBuffer, +typedef struct OpenCLContextT OpenCLContext; +typedef struct OpenCLKernelT OpenCLKernel; +typedef struct OpenCLDevicePtrT OpenCLDevicePtr; + +typedef struct CUDAContextT CUDAContext; +typedef struct CUDAKernelT CUDAKernel; +typedef struct CUDADevicePtrT CUDADevicePtr; + +PollyGPUContext *polly_initContextCUDA(); +PollyGPUContext *polly_initContextCL(); +PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, const char *KernelName); void polly_freeKernel(PollyGPUFunction *Kernel); void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, Index: tools/GPURuntime/GPUJIT.c =================================================================== --- tools/GPURuntime/GPUJIT.c +++ tools/GPURuntime/GPUJIT.c @@ -14,6 +14,13 @@ #include "GPUJIT.h" #include #include + +#ifdef __APPLE__ +#include +#else +#include +#endif + #include #include #include @@ -22,6 +29,8 @@ static int DebugMode; static int CacheMode; +static PollyGPURuntime Runtime = RUNTIME_NONE; + static void debug_print(const char *format, ...) { if (!DebugMode) return; @@ -33,18 +42,907 @@ } #define dump_function() debug_print("-> %s\n", __func__) -/* Define Polly's GPGPU data types. */ +#define KERNEL_CACHE_SIZE 10 + +static void err_runtime() { + fprintf(stdout, "Runtime not correctly initialized.\n"); + exit(-1); +} + struct PollyGPUContextT { - CUcontext Cuda; + void *Context; }; struct PollyGPUFunctionT { + void *Kernel; +}; + +struct PollyGPUDevicePtrT { + void *DevicePtr; +}; + +/******************************************************************************/ +/* OpenCL */ +/******************************************************************************/ + +struct OpenCLContextT { + cl_context Context; + cl_command_queue CommandQueue; +}; + +struct OpenCLKernelT { + cl_kernel Kernel; + cl_program Program; + const char *BinaryString; +}; + +struct OpenCLDevicePtrT { + cl_mem MemObj; +}; + +/* Dynamic library handles for the OpenCL runtime library. */ +static void *HandleOpenCL; + +/* Type-defines of function pointer to OpenCL Runtime API. */ +typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries, + cl_platform_id *Platforms, + cl_uint *NumPlatforms); +static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr; + +typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform, + cl_device_type DeviceType, + cl_uint NumEntries, cl_device_id *Devices, + cl_uint *NumDevices); +static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr; + +typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device, + cl_device_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet); +static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr; + +typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet); +static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr; + +typedef cl_context clCreateContextFcnTy( + const cl_context_properties *Properties, cl_uint NumDevices, + const cl_device_id *Devices, + void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo, + size_t CB, void *UserData), + void *UserData, cl_int *ErrcodeRet); +static clCreateContextFcnTy *clCreateContextFcnPtr; + +typedef cl_command_queue +clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device, + cl_command_queue_properties Properties, + cl_int *ErrcodeRet); +static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr; + +typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags, + size_t Size, void *HostPtr, + cl_int *ErrcodeRet); +static clCreateBufferFcnTy *clCreateBufferFcnPtr; + +typedef cl_int +clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer, + cl_bool BlockingWrite, size_t Offset, size_t Size, + const void *Ptr, cl_uint NumEventsInWaitList, + const cl_event *EventWaitList, cl_event *Event); +static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr; + +typedef cl_program clCreateProgramWithBinaryFcnTy( + cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList, + const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus, + cl_int *ErrcodeRet); +static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr; + +typedef cl_int clBuildProgramFcnTy( + cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList, + const char *Options, + void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData), + void *UserData); +static clBuildProgramFcnTy *clBuildProgramFcnPtr; + +typedef cl_kernel clCreateKernelFcnTy(cl_program Program, + const char *KernelName, + cl_int *ErrcodeRet); +static clCreateKernelFcnTy *clCreateKernelFcnPtr; + +typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex, + size_t ArgSize, const void *ArgValue); +static clSetKernelArgFcnTy *clSetKernelArgFcnPtr; + +typedef cl_int clEnqueueNDRangeKernelFcnTy( + cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim, + const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize, cl_uint NumEventsInWaitList, + const cl_event *EventWaitList, cl_event *Event); +static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr; + +typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue, + cl_mem Buffer, cl_bool BlockingRead, + size_t Offset, size_t Size, void *Ptr, + cl_uint NumEventsInWaitList, + const cl_event *EventWaitList, + cl_event *Event); +static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr; + +typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue); +static clFlushFcnTy *clFlushFcnPtr; + +typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue); +static clFinishFcnTy *clFinishFcnPtr; + +typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel); +static clReleaseKernelFcnTy *clReleaseKernelFcnPtr; + +typedef cl_int clReleaseProgramFcnTy(cl_program Program); +static clReleaseProgramFcnTy *clReleaseProgramFcnPtr; + +typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject); +static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr; + +typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue); +static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr; + +typedef cl_int clReleaseContextFcnTy(cl_context Context); +static clReleaseContextFcnTy *clReleaseContextFcnPtr; + +static void *getAPIHandleCL(void *Handle, const char *FuncName) { + char *Err; + void *FuncPtr; + dlerror(); + FuncPtr = dlsym(Handle, FuncName); + if ((Err = dlerror()) != 0) { + fprintf(stdout, "Load OpenCL Runtime API failed: %s. \n", Err); + return 0; + } + return FuncPtr; +} + +static int initialDeviceAPILibrariesCL() { + HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY); + if (!HandleOpenCL) { + printf("Cannot open library: %s. \n", dlerror()); + return 0; + } + return 1; +} + +static int initialDeviceAPIsCL() { + if (initialDeviceAPILibrariesCL() == 0) + return 0; + + /* Get function pointer to OpenCL Runtime API. + * + * Note that compilers conforming to the ISO C standard are required to + * generate a warning if a conversion from a void * pointer to a function + * pointer is attempted as in the following statements. The warning + * of this kind of cast may not be emitted by clang and new versions of gcc + * as it is valid on POSIX 2008. + */ + clGetPlatformIDsFcnPtr = + (clGetPlatformIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetPlatformIDs"); + + clGetDeviceIDsFcnPtr = + (clGetDeviceIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceIDs"); + + clGetDeviceInfoFcnPtr = + (clGetDeviceInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceInfo"); + + clGetKernelInfoFcnPtr = + (clGetKernelInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetKernelInfo"); + + clCreateContextFcnPtr = + (clCreateContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateContext"); + + clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL( + HandleOpenCL, "clCreateCommandQueue"); + + clCreateBufferFcnPtr = + (clCreateBufferFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateBuffer"); + + clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL( + HandleOpenCL, "clEnqueueWriteBuffer"); + + clCreateProgramWithBinaryFcnPtr = + (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL( + HandleOpenCL, "clCreateProgramWithBinary"); + + clBuildProgramFcnPtr = + (clBuildProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clBuildProgram"); + + clCreateKernelFcnPtr = + (clCreateKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateKernel"); + + clSetKernelArgFcnPtr = + (clSetKernelArgFcnTy *)getAPIHandleCL(HandleOpenCL, "clSetKernelArg"); + + clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL( + HandleOpenCL, "clEnqueueNDRangeKernel"); + + clEnqueueReadBufferFcnPtr = (clEnqueueReadBufferFcnTy *)getAPIHandleCL( + HandleOpenCL, "clEnqueueReadBuffer"); + + clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(HandleOpenCL, "clFlush"); + + clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(HandleOpenCL, "clFinish"); + + clReleaseKernelFcnPtr = + (clReleaseKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseKernel"); + + clReleaseProgramFcnPtr = + (clReleaseProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseProgram"); + + clReleaseMemObjectFcnPtr = (clReleaseMemObjectFcnTy *)getAPIHandleCL( + HandleOpenCL, "clReleaseMemObject"); + + clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL( + HandleOpenCL, "clReleaseCommandQueue"); + + clReleaseContextFcnPtr = + (clReleaseContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseContext"); + + return 1; +} + +/* Context and Device. */ +static PollyGPUContext *GlobalContext = NULL; +static cl_device_id GlobalDeviceID = NULL; + +/* Fd-Decl: Print out OpenCL Error codes to human readable strings. */ +void cl_printError(int Error); + +PollyGPUContext *initContextCL() { + dump_function(); + + PollyGPUContext *Context; + + cl_platform_id PlatformID = NULL; + cl_device_id DeviceID = NULL; + cl_uint NumDevicesRet; + cl_int Ret; + + char DeviceRevision[256]; + char DeviceName[256]; + size_t DeviceRevisionRetSize, DeviceNameRetSize; + + static __thread PollyGPUContext *CurrentContext = NULL; + + if (CurrentContext) + return CurrentContext; + + /* Get API handles. */ + if (initialDeviceAPIsCL() == 0) { + fprintf(stdout, "Getting the \"handle\" for the OpenCL Runtime failed.\n"); + exit(-1); + } + + /* Get number of devices that support OpenCL. */ + static const int NumberOfPlatforms = 1; + Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to get platform IDs.\n"); + cl_printError(Ret); + exit(-1); + } + // TODO: Extend to CL_DEVICE_TYPE_ALL? + static const int NumberOfDevices = 1; + Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices, + &DeviceID, &NumDevicesRet); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to get device IDs.\n"); + cl_printError(Ret); + exit(-1); + } + + GlobalDeviceID = DeviceID; + if (NumDevicesRet == 0) { + fprintf(stdout, "There is no device supporting OpenCL.\n"); + exit(-1); + } + + /* Get device revision. */ + Ret = + clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision), + DeviceRevision, &DeviceRevisionRetSize); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to fetch device revision.\n"); + cl_printError(Ret); + exit(-1); + } + + /* Get device name. */ + Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName), + DeviceName, &DeviceNameRetSize); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to fetch device name.\n"); + cl_printError(Ret); + exit(-1); + } + + debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName); + + /* Create context on the device. */ + Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext)); + if (Context == 0) { + fprintf(stdout, "Allocate memory for Polly GPU context failed.\n"); + exit(-1); + } + Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext)); + if (Context->Context == 0) { + fprintf(stdout, "Allocate memory for Polly OpenCL context failed.\n"); + exit(-1); + } + ((OpenCLContext *)Context->Context)->Context = + clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to create context.\n"); + cl_printError(Ret); + exit(-1); + } + + static const int ExtraProperties = 0; + ((OpenCLContext *)Context->Context)->CommandQueue = + clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context, + DeviceID, ExtraProperties, &Ret); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to create command queue.\n"); + cl_printError(Ret); + exit(-1); + } + + if (CacheMode) + CurrentContext = Context; + + GlobalContext = Context; + return Context; +} + +static void freeKernelCL(PollyGPUFunction *Kernel) { + dump_function(); + + if (CacheMode) + return; + + if (!GlobalContext) { + fprintf(stdout, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + cl_int Ret; + Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to flush command queue.\n"); + cl_printError(Ret); + exit(-1); + } + Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to flush command queue.\n"); + cl_printError(Ret); + exit(-1); + } + + if (((OpenCLKernel *)Kernel->Kernel)->Kernel) { + cl_int Ret = + clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to release kernel.\n"); + cl_printError(Ret); + exit(-1); + } + } + + if (((OpenCLKernel *)Kernel->Kernel)->Program) { + cl_int Ret = + clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to release program.\n"); + cl_printError(Ret); + exit(-1); + } + } + + if (Kernel->Kernel) + free((OpenCLKernel *)Kernel->Kernel); + + if (Kernel) + free(Kernel); +} + +PollyGPUFunction *getKernelCL(const char *BinaryBuffer, + const char *KernelName) { + dump_function(); + + if (!GlobalContext) { + fprintf(stdout, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; + static __thread int NextCacheItem = 0; + + for (long i = 0; i < KERNEL_CACHE_SIZE; i++) { + // We exploit here the property that all Polly-ACC kernels are allocated + // as global constants, hence a pointer comparision is sufficient to + // determin equality. + if (KernelCache[i] && + ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString == + BinaryBuffer) { + debug_print(" -> using cached kernel\n"); + return KernelCache[i]; + } + } + + PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); + if (Function == 0) { + fprintf(stdout, "Allocate memory for Polly GPU function failed.\n"); + exit(-1); + } + Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel)); + if (Function->Kernel == 0) { + fprintf(stdout, "Allocate memory for Polly OpenCL kernel failed.\n"); + exit(-1); + } + + if (!GlobalDeviceID) { + fprintf(stdout, "GPGPU-code generation not initialized correctly.\n"); + exit(-1); + } + + cl_int Ret; + size_t BinarySize = strlen(BinaryBuffer); + ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr( + ((OpenCLContext *)GlobalContext->Context)->Context, 1, &GlobalDeviceID, + (const size_t *)&BinarySize, (const unsigned char **)&BinaryBuffer, NULL, + &Ret); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to create program from binary.\n"); + cl_printError(Ret); + exit(-1); + } + + Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1, + &GlobalDeviceID, NULL, NULL, NULL); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to build program.\n"); + cl_printError(Ret); + exit(-1); + } + + ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr( + ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to create kernel.\n"); + cl_printError(Ret); + exit(-1); + } + + ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer; + + if (CacheMode) { + if (KernelCache[NextCacheItem]) + freeKernelCL(KernelCache[NextCacheItem]); + + KernelCache[NextCacheItem] = Function; + + NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE; + } + + return Function; +} + +void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData, + long MemSize) { + dump_function(); + + if (!GlobalContext) { + fprintf(stdout, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + cl_int Ret; + Ret = clEnqueueWriteBufferFcnPtr( + ((OpenCLContext *)GlobalContext->Context)->CommandQueue, + ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize, + HostData, 0, NULL, NULL); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Copying data from host memory to device failed.\n"); + cl_printError(Ret); + exit(-1); + } +} + +void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData, + long MemSize) { + dump_function(); + + if (!GlobalContext) { + fprintf(stdout, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + cl_int Ret; + Ret = clEnqueueReadBufferFcnPtr( + ((OpenCLContext *)GlobalContext->Context)->CommandQueue, + ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize, + HostData, 0, NULL, NULL); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Copying results from device to host memory failed.\n"); + cl_printError(Ret); + exit(-1); + } +} + +void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX, + unsigned int GridDimY, unsigned int BlockDimX, + unsigned int BlockDimY, unsigned int BlockDimZ, + void **Parameters) { + dump_function(); + + cl_int Ret; + cl_uint NumArgs; + + if (!GlobalContext) { + fprintf(stdout, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel; + Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS, + sizeof(cl_uint), &NumArgs, NULL); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to get number of kernel arguments.\n"); + cl_printError(Ret); + exit(-1); + } + + for (cl_uint i = 0; i < NumArgs; i++) { + Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 8, (void *)Parameters[i]); + if (Ret == CL_INVALID_ARG_SIZE) { + Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 4, (void *)Parameters[i]); + if (Ret == CL_INVALID_ARG_SIZE) { + Ret = + clSetKernelArgFcnPtr(CLKernel->Kernel, i, 2, (void *)Parameters[i]); + if (Ret == CL_INVALID_ARG_SIZE) { + Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 1, + (void *)Parameters[i]); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to set Kernel argument %d.\n", i); + cl_printError(Ret); + exit(-1); + } + } + } + } + if (Ret != CL_SUCCESS && Ret != CL_INVALID_ARG_SIZE) { + fprintf(stdout, "Failed to set Kernel argument.\n"); + cl_printError(Ret); + exit(-1); + } + } + + unsigned int GridDimZ = 1; + size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY, + BlockDimZ * GridDimZ}; + size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ}; + + static const int WorkDim = 3; + OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context; + Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel, + WorkDim, NULL, GlobalWorkSize, + LocalWorkSize, 0, NULL, NULL); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Launching OpenCL kernel failed.\n"); + cl_printError(Ret); + exit(-1); + } +} + +void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) { + dump_function(); + + OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; + cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Failed to free device memory.\n"); + cl_printError(Ret); + exit(-1); + } + + free(DevPtr); + free(Allocation); +} + +PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) { + dump_function(); + + if (!GlobalContext) { + fprintf(stdout, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); + if (DevData == 0) { + fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); + exit(-1); + } + DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr)); + if (DevData->DevicePtr == 0) { + fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); + exit(-1); + } + + cl_int Ret; + ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj = + clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context, + CL_MEM_READ_WRITE, MemSize, NULL, &Ret); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); + cl_printError(Ret); + exit(-1); + } + + return DevData; +} + +void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) { + dump_function(); + + OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; + return (void *)DevPtr->MemObj; +} + +void freeContextCL(PollyGPUContext *Context) { + dump_function(); + + cl_int Ret; + + GlobalContext = NULL; + + OpenCLContext *Ctx = (OpenCLContext *)Context->Context; + if (Ctx->CommandQueue) { + Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Could not release command queue.\n"); + cl_printError(Ret); + exit(-1); + } + } + + if (Ctx->Context) { + Ret = clReleaseContextFcnPtr(Ctx->Context); + if (Ret != CL_SUCCESS) { + fprintf(stdout, "Could not release context.\n"); + cl_printError(Ret); + exit(-1); + } + } + + free(Ctx); + free(Context); +} + +void cl_printError(int Error) { + + switch (Error) { + case CL_SUCCESS: + // Success, don't print an error. + break; + + // JIT/Runtime errors. + case CL_DEVICE_NOT_FOUND: + fprintf(stdout, "Device not found.\n"); + break; + case CL_DEVICE_NOT_AVAILABLE: + fprintf(stdout, "Device not available.\n"); + break; + case CL_COMPILER_NOT_AVAILABLE: + fprintf(stdout, "Compiler not available.\n"); + break; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + fprintf(stdout, "Mem object allocation failure.\n"); + break; + case CL_OUT_OF_RESOURCES: + fprintf(stdout, "Out of resources.\n"); + break; + case CL_OUT_OF_HOST_MEMORY: + fprintf(stdout, "Out of host memory.\n"); + break; + case CL_PROFILING_INFO_NOT_AVAILABLE: + fprintf(stdout, "Profiling info not available.\n"); + break; + case CL_MEM_COPY_OVERLAP: + fprintf(stdout, "Mem copy overlap.\n"); + break; + case CL_IMAGE_FORMAT_MISMATCH: + fprintf(stdout, "Image format mismatch.\n"); + break; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: + fprintf(stdout, "Image format not supported.\n"); + break; + case CL_BUILD_PROGRAM_FAILURE: + fprintf(stdout, "Build program failure.\n"); + break; + case CL_MAP_FAILURE: + fprintf(stdout, "Map failure.\n"); + break; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: + fprintf(stdout, "Misaligned sub buffer offset.\n"); + break; + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: + fprintf(stdout, "Exec status error for events in wait list.\n"); + break; + case CL_COMPILE_PROGRAM_FAILURE: + fprintf(stdout, "Compile program failure.\n"); + break; + case CL_LINKER_NOT_AVAILABLE: + fprintf(stdout, "Linker not available.\n"); + break; + case CL_LINK_PROGRAM_FAILURE: + fprintf(stdout, "Link program failure.\n"); + break; + case CL_DEVICE_PARTITION_FAILED: + fprintf(stdout, "Device partition failed.\n"); + break; + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: + fprintf(stdout, "Kernel arg info not available.\n"); + break; + + // Compiler errors. + case CL_INVALID_VALUE: + fprintf(stdout, "Invalid value.\n"); + break; + case CL_INVALID_DEVICE_TYPE: + fprintf(stdout, "Invalid device type.\n"); + break; + case CL_INVALID_PLATFORM: + fprintf(stdout, "Invalid platform.\n"); + break; + case CL_INVALID_DEVICE: + fprintf(stdout, "Invalid device.\n"); + break; + case CL_INVALID_CONTEXT: + fprintf(stdout, "Invalid context.\n"); + break; + case CL_INVALID_QUEUE_PROPERTIES: + fprintf(stdout, "Invalid queue properties.\n"); + break; + case CL_INVALID_COMMAND_QUEUE: + fprintf(stdout, "Invalid command queue.\n"); + break; + case CL_INVALID_HOST_PTR: + fprintf(stdout, "Invalid host pointer.\n"); + break; + case CL_INVALID_MEM_OBJECT: + fprintf(stdout, "Invalid memory object.\n"); + break; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: + fprintf(stdout, "Invalid image format descriptor.\n"); + break; + case CL_INVALID_IMAGE_SIZE: + fprintf(stdout, "Invalid image size.\n"); + break; + case CL_INVALID_SAMPLER: + fprintf(stdout, "Invalid sampler.\n"); + break; + case CL_INVALID_BINARY: + fprintf(stdout, "Invalid binary.\n"); + break; + case CL_INVALID_BUILD_OPTIONS: + fprintf(stdout, "Invalid build options.\n"); + break; + case CL_INVALID_PROGRAM: + fprintf(stdout, "Invalid program.\n"); + break; + case CL_INVALID_PROGRAM_EXECUTABLE: + fprintf(stdout, "Invalid program executable.\n"); + break; + case CL_INVALID_KERNEL_NAME: + fprintf(stdout, "Invalid kernel name.\n"); + break; + case CL_INVALID_KERNEL_DEFINITION: + fprintf(stdout, "Invalid kernel definition.\n"); + break; + case CL_INVALID_KERNEL: + fprintf(stdout, "Invalid kernel.\n"); + break; + case CL_INVALID_ARG_INDEX: + fprintf(stdout, "Invalid arg index.\n"); + break; + case CL_INVALID_ARG_VALUE: + fprintf(stdout, "Invalid arg value.\n"); + break; + case CL_INVALID_ARG_SIZE: + fprintf(stdout, "Invalid arg size.\n"); + break; + case CL_INVALID_KERNEL_ARGS: + fprintf(stdout, "Invalid kernel args.\n"); + break; + case CL_INVALID_WORK_DIMENSION: + fprintf(stdout, "Invalid work dimension.\n"); + break; + case CL_INVALID_WORK_GROUP_SIZE: + fprintf(stdout, "Invalid work group size.\n"); + break; + case CL_INVALID_WORK_ITEM_SIZE: + fprintf(stdout, "Invalid work item size.\n"); + break; + case CL_INVALID_GLOBAL_OFFSET: + fprintf(stdout, "Invalid global offset.\n"); + break; + case CL_INVALID_EVENT_WAIT_LIST: + fprintf(stdout, "Invalid event wait list.\n"); + break; + case CL_INVALID_EVENT: + fprintf(stdout, "Invalid event.\n"); + break; + case CL_INVALID_OPERATION: + fprintf(stdout, "Invalid operation.\n"); + break; + case CL_INVALID_GL_OBJECT: + fprintf(stdout, "Invalid GL object.\n"); + break; + case CL_INVALID_BUFFER_SIZE: + fprintf(stdout, "Invalid buffer size.\n"); + break; + case CL_INVALID_MIP_LEVEL: + fprintf(stdout, "Invalid mip level.\n"); + break; + case CL_INVALID_GLOBAL_WORK_SIZE: + fprintf(stdout, "Invalid global work size.\n"); + break; + case CL_INVALID_PROPERTY: + fprintf(stdout, "Invalid property.\n"); + break; + case CL_INVALID_IMAGE_DESCRIPTOR: + fprintf(stdout, "Invalid image descriptor.\n"); + break; + case CL_INVALID_COMPILER_OPTIONS: + fprintf(stdout, "Invalid compiler options.\n"); + break; + case CL_INVALID_LINKER_OPTIONS: + fprintf(stdout, "Invalid linker options.\n"); + break; + case CL_INVALID_DEVICE_PARTITION_COUNT: + fprintf(stdout, "Invalid device partition count.\n"); + break; + case CL_INVALID_PIPE_SIZE: + fprintf(stdout, "Invalid pipe size.\n"); + break; + case CL_INVALID_DEVICE_QUEUE: + fprintf(stdout, "Invalid device queue.\n"); + break; + + // NVIDIA specific error. + case -9999: + fprintf(stdout, "NVIDIA invalid read or write buffer.\n"); + break; + + default: + fprintf(stdout, "Unknown error code!\n"); + break; + } +} + +/******************************************************************************/ +/* CUDA */ +/******************************************************************************/ + +struct CUDAContextT { + CUcontext Cuda; +}; + +struct CUDAKernelT { CUfunction Cuda; CUmodule CudaModule; - const char *PTXString; + const char *BinaryString; }; -struct PollyGPUDevicePtrT { +struct CUDADevicePtrT { CUdeviceptr Cuda; }; @@ -57,10 +955,10 @@ static CuMemAllocFcnTy *CuMemAllocFcnPtr; typedef CUresult CUDAAPI CuLaunchKernelFcnTy( - CUfunction f, unsigned int gridDimX, unsigned int gridDimY, - unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, - unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, - void **kernelParams, void **extra); + CUfunction F, unsigned int GridDimX, unsigned int GridDimY, + unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY, + unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream, + void **KernelParams, void **Extra); static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr; typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t); @@ -95,8 +993,8 @@ void **); static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr; -typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module, - const void *image); +typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module, + const void *Image); static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr; typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule, @@ -109,32 +1007,32 @@ typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice); static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr; -typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state, - CUjitInputType type, void *data, - size_t size, const char *name, - unsigned int numOptions, - CUjit_option *options, - void **optionValues); +typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State, + CUjitInputType Type, void *Data, + size_t Size, const char *Name, + unsigned int NumOptions, + CUjit_option *Options, + void **OptionValues); static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr; -typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions, - CUjit_option *options, - void **optionValues, - CUlinkState *stateOut); +typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions, + CUjit_option *Options, + void **OptionValues, + CUlinkState *StateOut); static CuLinkCreateFcnTy *CuLinkCreateFcnPtr; -typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut, - size_t *sizeOut); +typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut, + size_t *SizeOut); static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr; -typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state); +typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State); static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr; /* Type-defines of function pointer ot CUDA runtime APIs. */ typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void); static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr; -static void *getAPIHandle(void *Handle, const char *FuncName) { +static void *getAPIHandleCUDA(void *Handle, const char *FuncName) { char *Err; void *FuncPtr; dlerror(); @@ -146,7 +1044,7 @@ return FuncPtr; } -static int initialDeviceAPILibraries() { +static int initialDeviceAPILibrariesCUDA() { HandleCuda = dlopen("libcuda.so", RTLD_LAZY); if (!HandleCuda) { printf("Cannot open library: %s. \n", dlerror()); @@ -162,8 +1060,8 @@ return 1; } -static int initialDeviceAPIs() { - if (initialDeviceAPILibraries() == 0) +static int initialDeviceAPIsCUDA() { + if (initialDeviceAPILibrariesCUDA() == 0) return 0; /* Get function pointer to CUDA Driver APIs. @@ -175,75 +1073,75 @@ * as it is valid on POSIX 2008. */ CuLaunchKernelFcnPtr = - (CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel"); + (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel"); CuMemAllocFcnPtr = - (CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2"); + (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2"); - CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandle(HandleCuda, "cuMemFree_v2"); + CuMemFreeFcnPtr = + (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2"); CuMemcpyDtoHFcnPtr = - (CuMemcpyDtoHFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyDtoH_v2"); + (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2"); CuMemcpyHtoDFcnPtr = - (CuMemcpyHtoDFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyHtoD_v2"); + (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2"); CuModuleUnloadFcnPtr = - (CuModuleUnloadFcnTy *)getAPIHandle(HandleCuda, "cuModuleUnload"); + (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload"); CuCtxDestroyFcnPtr = - (CuCtxDestroyFcnTy *)getAPIHandle(HandleCuda, "cuCtxDestroy"); + (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy"); - CuInitFcnPtr = (CuInitFcnTy *)getAPIHandle(HandleCuda, "cuInit"); + CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit"); CuDeviceGetCountFcnPtr = - (CuDeviceGetCountFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetCount"); + (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount"); CuDeviceGetFcnPtr = - (CuDeviceGetFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGet"); + (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet"); CuCtxCreateFcnPtr = - (CuCtxCreateFcnTy *)getAPIHandle(HandleCuda, "cuCtxCreate_v2"); + (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2"); - CuModuleLoadDataExFcnPtr = - (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx"); + CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA( + HandleCuda, "cuModuleLoadDataEx"); CuModuleLoadDataFcnPtr = - (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData"); + (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData"); - CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle( + CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA( HandleCuda, "cuModuleGetFunction"); CuDeviceComputeCapabilityFcnPtr = - (CuDeviceComputeCapabilityFcnTy *)getAPIHandle( + (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA( HandleCuda, "cuDeviceComputeCapability"); CuDeviceGetNameFcnPtr = - (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName"); + (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName"); CuLinkAddDataFcnPtr = - (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData"); + (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData"); CuLinkCreateFcnPtr = - (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate"); + (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate"); CuLinkCompleteFcnPtr = - (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete"); + (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete"); CuLinkDestroyFcnPtr = - (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy"); + (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy"); /* Get function pointer to CUDA Runtime APIs. */ - CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle( + CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA( HandleCudaRT, "cudaThreadSynchronize"); return 1; } -PollyGPUContext *polly_initContext() { - DebugMode = getenv("POLLY_DEBUG") != 0; - +PollyGPUContext *initContextCUDA() { dump_function(); + PollyGPUContext *Context; CUdevice Device; @@ -257,7 +1155,7 @@ return CurrentContext; /* Get API handles. */ - if (initialDeviceAPIs() == 0) { + if (initialDeviceAPIsCUDA() == 0) { fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n"); exit(-1); } @@ -287,9 +1185,12 @@ fprintf(stdout, "Allocate memory for Polly GPU context failed.\n"); exit(-1); } - CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device); - - CacheMode = getenv("POLLY_NOCACHE") == 0; + Context->Context = malloc(sizeof(CUDAContext)); + if (Context->Context == 0) { + fprintf(stdout, "Allocate memory for Polly CUDA context failed.\n"); + exit(-1); + } + CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0, Device); if (CacheMode) CurrentContext = Context; @@ -297,18 +1198,24 @@ return Context; } -static void freeKernel(PollyGPUFunction *Kernel) { - if (Kernel->CudaModule) - CuModuleUnloadFcnPtr(Kernel->CudaModule); +static void freeKernelCUDA(PollyGPUFunction *Kernel) { + dump_function(); + + if (CacheMode) + return; + + if (((CUDAKernel *)Kernel->Kernel)->CudaModule) + CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule); + + if (Kernel->Kernel) + free((CUDAKernel *)Kernel->Kernel); if (Kernel) free(Kernel); } -#define KERNEL_CACHE_SIZE 10 - -PollyGPUFunction *polly_getKernel(const char *PTXBuffer, - const char *KernelName) { +PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer, + const char *KernelName) { dump_function(); static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; @@ -318,18 +1225,23 @@ // We exploit here the property that all Polly-ACC kernels are allocated // as global constants, hence a pointer comparision is sufficient to // determin equality. - if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) { + if (KernelCache[i] && + ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) { debug_print(" -> using cached kernel\n"); return KernelCache[i]; } } PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); - if (Function == 0) { fprintf(stdout, "Allocate memory for Polly GPU function failed.\n"); exit(-1); } + Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel)); + if (Function->Kernel == 0) { + fprintf(stdout, "Allocate memory for Polly CUDA function failed.\n"); + exit(-1); + } CUresult Res; CUlinkState LState; @@ -364,8 +1276,8 @@ memset(ErrorLog, 0, sizeof(ErrorLog)); CuLinkCreateFcnPtr(6, Options, OptionVals, &LState); - Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer, - strlen(PTXBuffer) + 1, 0, 0, 0, 0); + Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer, + strlen(BinaryBuffer) + 1, 0, 0, 0, 0); if (Res != CUDA_SUCCESS) { fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog); exit(-1); @@ -381,13 +1293,15 @@ debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime, InfoLog); - Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut); + Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule), + CuOut); if (Res != CUDA_SUCCESS) { fprintf(stdout, "Loading ptx assembly text failed.\n"); exit(-1); } - Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule, + Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda), + ((CUDAKernel *)Function->Kernel)->CudaModule, KernelName); if (Res != CUDA_SUCCESS) { fprintf(stdout, "Loading kernel function failed.\n"); @@ -396,11 +1310,11 @@ CuLinkDestroyFcnPtr(LState); - Function->PTXString = PTXBuffer; + ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer; if (CacheMode) { if (KernelCache[NextCacheItem]) - freeKernel(KernelCache[NextCacheItem]); + freeKernelCUDA(KernelCache[NextCacheItem]); KernelCache[NextCacheItem] = Function; @@ -410,37 +1324,29 @@ return Function; } -void polly_freeKernel(PollyGPUFunction *Kernel) { +void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData, + long MemSize) { dump_function(); - if (CacheMode) - return; - - freeKernel(Kernel); -} - -void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, - long MemSize) { - dump_function(); - - CUdeviceptr CuDevData = DevData->Cuda; + CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda; CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize); } -void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, - long MemSize) { +void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData, + long MemSize) { dump_function(); - if (CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) { + if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda, + MemSize) != CUDA_SUCCESS) { fprintf(stdout, "Copying results from device to host memory failed.\n"); exit(-1); } } -void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, - unsigned int GridDimY, unsigned int BlockDimX, - unsigned int BlockDimY, unsigned int BlockDimZ, - void **Parameters) { +void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX, + unsigned int GridDimY, unsigned int BlockDimX, + unsigned int BlockDimY, unsigned int BlockDimZ, + void **Parameters) { dump_function(); unsigned GridDimZ = 1; @@ -449,32 +1355,40 @@ void **Extra = 0; CUresult Res; - Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ, - BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes, - Stream, Parameters, Extra); + Res = + CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX, + GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ, + SharedMemBytes, Stream, Parameters, Extra); if (Res != CUDA_SUCCESS) { fprintf(stdout, "Launching CUDA kernel failed.\n"); exit(-1); } } -void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) { +void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) { dump_function(); - CuMemFreeFcnPtr((CUdeviceptr)Allocation->Cuda); + CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr; + CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda); + free(DevPtr); free(Allocation); } -PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) { +PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) { dump_function(); PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); - if (DevData == 0) { fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); exit(-1); } + DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr)); + if (DevData->DevicePtr == 0) { + fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); + exit(-1); + } - CUresult Res = CuMemAllocFcnPtr(&(DevData->Cuda), MemSize); + CUresult Res = + CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize); if (Res != CUDA_SUCCESS) { fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); @@ -484,10 +1398,191 @@ return DevData; } +void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) { + dump_function(); + + CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr; + return (void *)DevPtr->Cuda; +} + +void freeContextCUDA(PollyGPUContext *Context) { + dump_function(); + + CUDAContext *Ctx = (CUDAContext *)Context->Context; + if (Ctx->Cuda) { + CuCtxDestroyFcnPtr(Ctx->Cuda); + free(Ctx); + free(Context); + } + + dlclose(HandleCuda); + dlclose(HandleCudaRT); +} + +/******************************************************************************/ +/* API */ +/******************************************************************************/ + +PollyGPUContext *polly_initContext() { + DebugMode = getenv("POLLY_DEBUG") != 0; + CacheMode = getenv("POLLY_NOCACHE") == 0; + + dump_function(); + + PollyGPUContext *Context; + + switch (Runtime) { + case RUNTIME_CUDA: + Context = initContextCUDA(); + break; + case RUNTIME_CL: + Context = initContextCL(); + break; + default: + err_runtime(); + } + + return Context; +} + +void polly_freeKernel(PollyGPUFunction *Kernel) { + dump_function(); + + switch (Runtime) { + case RUNTIME_CUDA: + freeKernelCUDA(Kernel); + break; + case RUNTIME_CL: + freeKernelCL(Kernel); + break; + default: + err_runtime(); + } +} + +PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, + const char *KernelName) { + dump_function(); + + PollyGPUFunction *Function; + + switch (Runtime) { + case RUNTIME_CUDA: + Function = getKernelCUDA(BinaryBuffer, KernelName); + break; + case RUNTIME_CL: + Function = getKernelCL(BinaryBuffer, KernelName); + break; + default: + err_runtime(); + } + + return Function; +} + +void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, + long MemSize) { + dump_function(); + + switch (Runtime) { + case RUNTIME_CUDA: + copyFromHostToDeviceCUDA(HostData, DevData, MemSize); + break; + case RUNTIME_CL: + copyFromHostToDeviceCL(HostData, DevData, MemSize); + break; + default: + err_runtime(); + } +} + +void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, + long MemSize) { + dump_function(); + + switch (Runtime) { + case RUNTIME_CUDA: + copyFromDeviceToHostCUDA(DevData, HostData, MemSize); + break; + case RUNTIME_CL: + copyFromDeviceToHostCL(DevData, HostData, MemSize); + break; + default: + err_runtime(); + } +} + +void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, + unsigned int GridDimY, unsigned int BlockDimX, + unsigned int BlockDimY, unsigned int BlockDimZ, + void **Parameters) { + dump_function(); + + switch (Runtime) { + case RUNTIME_CUDA: + launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, + BlockDimZ, Parameters); + break; + case RUNTIME_CL: + launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ, + Parameters); + break; + default: + err_runtime(); + } +} + +void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) { + dump_function(); + + switch (Runtime) { + case RUNTIME_CUDA: + freeDeviceMemoryCUDA(Allocation); + break; + case RUNTIME_CL: + freeDeviceMemoryCL(Allocation); + break; + default: + err_runtime(); + } +} + +PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) { + dump_function(); + + PollyGPUDevicePtr *DevData; + + switch (Runtime) { + case RUNTIME_CUDA: + DevData = allocateMemoryForDeviceCUDA(MemSize); + break; + case RUNTIME_CL: + DevData = allocateMemoryForDeviceCL(MemSize); + break; + default: + err_runtime(); + } + + return DevData; +} + void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) { dump_function(); - return (void *)Allocation->Cuda; + void *DevPtr; + + switch (Runtime) { + case RUNTIME_CUDA: + DevPtr = getDevicePtrCUDA(Allocation); + break; + case RUNTIME_CL: + DevPtr = getDevicePtrCL(Allocation); + break; + default: + err_runtime(); + } + + return DevPtr; } void polly_freeContext(PollyGPUContext *Context) { @@ -496,11 +1591,26 @@ if (CacheMode) return; - if (Context->Cuda) { - CuCtxDestroyFcnPtr(Context->Cuda); - free(Context); + switch (Runtime) { + case RUNTIME_CUDA: + freeContextCUDA(Context); + break; + case RUNTIME_CL: + freeContextCL(Context); + break; + default: + err_runtime(); } +} - dlclose(HandleCuda); - dlclose(HandleCudaRT); +/* Initialize GPUJIT with CUDA as runtime library. */ +PollyGPUContext *polly_initContextCUDA() { + Runtime = RUNTIME_CUDA; + return polly_initContext(); +} + +/* Initialize GPUJIT with OpenCL as runtime library. */ +PollyGPUContext *polly_initContextCL() { + Runtime = RUNTIME_CL; + return polly_initContext(); }