Index: CMakeLists.txt =================================================================== --- CMakeLists.txt +++ CMakeLists.txt @@ -152,9 +152,10 @@ option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF) if (POLLY_ENABLE_GPGPU_CODEGEN) - # Do not require CUDA, as GPU code generation test cases can be run without - # a cuda library. + # Do not require CUDA/OpenCL, as GPU code generation test cases can be run + # without a CUDA/OpenCL library. FIND_PACKAGE(CUDA) + FIND_PACKAGE(OpenCL) set(GPU_CODEGEN TRUE) else(POLLY_ENABLE_GPGPU_CODEGEN) set(GPU_CODEGEN FALSE) @@ -163,8 +164,13 @@ # Support GPGPU code generation if the library is available. if (CUDALIB_FOUND) + add_definitions(-DHAS_LIBCUDART) INCLUDE_DIRECTORIES( ${CUDALIB_INCLUDE_DIR} ) endif(CUDALIB_FOUND) +if (OpenCL_FOUND) + add_definitions(-DHAS_LIBOPENCL) + INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} ) +endif(OpenCL_FOUND) option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON) if (NOT POLLY_BUNDLED_ISL) Index: include/polly/CodeGen/PPCGCodeGeneration.h =================================================================== --- /dev/null +++ include/polly/CodeGen/PPCGCodeGeneration.h @@ -0,0 +1,24 @@ +//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Take a scop created by ScopInfo and map it to GPU code using the ppcg +// GPU mapping strategy. +// +//===----------------------------------------------------------------------===// + +#ifndef POLLY_PPCGCODEGENERATION_H +#define POLLY_PPCGCODEGENERATION_H + +/// The GPU Architecture to target. +enum GPUArch { NVPTX64 }; + +/// The GPU Runtime implementation to use. +enum GPURuntime { CUDA, OpenCL }; + +#endif // POLLY_PPCGCODEGENERATION_H Index: include/polly/LinkAllPasses.h =================================================================== --- include/polly/LinkAllPasses.h +++ include/polly/LinkAllPasses.h @@ -15,6 +15,7 @@ #ifndef POLLY_LINKALLPASSES_H #define POLLY_LINKALLPASSES_H +#include "polly/CodeGen/PPCGCodeGeneration.h" #include "polly/Config/config.h" #include "polly/PruneUnprofitable.h" #include "polly/Simplify.h" @@ -48,7 +49,8 @@ llvm::Pass *createIslAstInfoPass(); llvm::Pass *createCodeGenerationPass(); #ifdef GPU_CODEGEN -llvm::Pass *createPPCGCodeGenerationPass(); +llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64, + GPURuntime Runtime = GPURuntime::CUDA); #endif llvm::Pass *createIslScheduleOptimizerPass(); llvm::Pass *createFlattenSchedulePass(); Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// +#include "polly/CodeGen/PPCGCodeGeneration.h" #include "polly/CodeGen/IslAst.h" #include "polly/CodeGen/IslNodeBuilder.h" #include "polly/CodeGen/Utils.h" @@ -153,9 +154,9 @@ GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, Scop &S, BasicBlock *StartBlock, - gpu_prog *Prog) + gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch) : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock), - Prog(Prog) { + Prog(Prog), Runtime(Runtime), Arch(Arch) { getExprBuilder().setIDToSAI(&IDToSAI); } @@ -201,6 +202,12 @@ /// The GPU program we generate code for. gpu_prog *Prog; + /// The GPU Runtime implementation to use (OpenCL or CUDA). + GPURuntime Runtime; + + /// The GPU Architecture to target. + GPUArch Arch; + /// Class to free isl_ids. class IslIdDeleter { public: @@ -752,7 +759,17 @@ } Value *GPUNodeBuilder::createCallInitContext() { - const char *Name = "polly_initContext"; + const char *Name; + + switch (Runtime) { + case GPURuntime::CUDA: + Name = "polly_initContextCUDA"; + break; + case GPURuntime::OpenCL: + Name = "polly_initContextCL"; + break; + } + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Function *F = M->getFunction(Name); @@ -1028,7 +1045,15 @@ void GPUNodeBuilder::createKernelSync() { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); + + Function *Sync; + + switch (Arch) { + case GPUArch::NVPTX64: + Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); + break; + } + Builder.CreateCall(Sync, {}); } @@ -1434,7 +1459,12 @@ auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, GPUModule.get()); - FN->setCallingConv(CallingConv::PTX_Kernel); + + switch (Arch) { + case GPUArch::NVPTX64: + FN->setCallingConv(CallingConv::PTX_Kernel); + break; + } auto Arg = FN->arg_begin(); for (long i = 0; i < Kernel->n_array; i++) { @@ -1495,12 +1525,19 @@ } void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { - Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x, - Intrinsic::nvvm_read_ptx_sreg_ctaid_y}; + Intrinsic::ID IntrinsicsBID[2]; + Intrinsic::ID IntrinsicsTID[3]; + + switch (Arch) { + case GPUArch::NVPTX64: + IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x; + IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y; - Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x, - Intrinsic::nvvm_read_ptx_sreg_tid_y, - Intrinsic::nvvm_read_ptx_sreg_tid_z}; + IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x; + IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y; + IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z; + break; + } auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { std::string Name = isl_id_get_name(Id); @@ -1649,11 +1686,18 @@ void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, SetVector &SubtreeValues) { - std::string Identifier = "kernel_" + std::to_string(Kernel->id); GPUModule.reset(new Module(Identifier, Builder.getContext())); - GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); - GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); + + switch (Arch) { + case GPUArch::NVPTX64: + if (Runtime == GPURuntime::CUDA) + GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); + else if (Runtime == GPURuntime::OpenCL) + GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl")); + GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); + break; + } Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); @@ -1674,7 +1718,21 @@ } std::string GPUNodeBuilder::createKernelASM() { - llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda")); + llvm::Triple GPUTriple; + + switch (Arch) { + case GPUArch::NVPTX64: + switch (Runtime) { + case GPURuntime::CUDA: + GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda")); + break; + case GPURuntime::OpenCL: + GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl")); + break; + } + break; + } + std::string ErrMsg; auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); @@ -1685,9 +1743,17 @@ TargetOptions Options; Options.UnsafeFPMath = FastMath; - std::unique_ptr TargetM( - GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "", - Options, Optional())); + + std::string subtarget; + + switch (Arch) { + case GPUArch::NVPTX64: + subtarget = CudaVersion; + break; + } + + std::unique_ptr TargetM(GPUTarget->createTargetMachine( + GPUTriple.getTriple(), subtarget, "", Options, Optional())); SmallString<0> ASMString; raw_svector_ostream ASMStream(ASMString); @@ -1739,6 +1805,10 @@ public: static char ID; + GPURuntime Runtime = GPURuntime::CUDA; + + GPUArch Architecture = GPUArch::NVPTX64; + /// The scop that is currently processed. Scop *S; @@ -2522,7 +2592,7 @@ executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI); GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S, - StartBlock, Prog); + StartBlock, Prog, Runtime, Architecture); // TODO: Handle LICM auto SplitBlock = StartBlock->getSinglePredecessor(); @@ -2610,7 +2680,12 @@ char PPCGCodeGeneration::ID = 1; -Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); } +Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) { + PPCGCodeGeneration *generator = new PPCGCodeGeneration(); + generator->Runtime = Runtime; + generator->Architecture = Arch; + return generator; +} INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg", "Polly - Apply PPCG translation to SCOP", false, false) Index: lib/Support/RegisterPasses.cpp =================================================================== --- lib/Support/RegisterPasses.cpp +++ lib/Support/RegisterPasses.cpp @@ -23,6 +23,7 @@ #include "polly/Canonicalization.h" #include "polly/CodeGen/CodeGeneration.h" #include "polly/CodeGen/CodegenCleanup.h" +#include "polly/CodeGen/PPCGCodeGeneration.h" #include "polly/DeLICM.h" #include "polly/DependenceInfo.h" #include "polly/FlattenSchedule.h" @@ -101,6 +102,23 @@ ), cl::init(TARGET_CPU), cl::ZeroOrMore, cl::cat(PollyCategory)); +#ifdef GPU_CODEGEN +static cl::opt GPURuntimeChoice( + "polly-gpu-runtime", cl::desc("The GPU Runtime API to target"), + cl::values(clEnumValN(GPURuntime::CUDA, "libcudart", + "use the CUDA Runtime API"), + clEnumValN(GPURuntime::OpenCL, "libopencl", + "use the OpenCL Runtime API")), + cl::init(GPURuntime::CUDA), cl::ZeroOrMore, cl::cat(PollyCategory)); + +static cl::opt + GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"), + cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64", + "target NVIDIA 64-bit architecture")), + cl::init(GPUArch::NVPTX64), cl::ZeroOrMore, + cl::cat(PollyCategory)); +#endif + VectorizerChoice polly::PollyVectorizerChoice; static cl::opt Vectorizer( "polly-vectorizer", cl::desc("Select the vectorization strategy"), @@ -309,7 +327,8 @@ if (Target == TARGET_GPU) { #ifdef GPU_CODEGEN - PM.add(polly::createPPCGCodeGenerationPass()); + PM.add( + polly::createPPCGCodeGenerationPass(GPUArchChoice, GPURuntimeChoice)); #endif } else { switch (CodeGeneration) { Index: test/GPGPU/cuda-managed-memory-simple.ll =================================================================== --- test/GPGPU/cuda-managed-memory-simple.ll +++ test/GPGPU/cuda-managed-memory-simple.ll @@ -35,7 +35,7 @@ ; CHECK-NOT: polly_freeDeviceMemory ; CHECK-NOT: polly_allocateMemoryForDevice -; CHECK: %13 = call i8* @polly_initContext() +; CHECK: %13 = call i8* @polly_initContextCUDA() ; CHECK-NEXT: %14 = bitcast i32* %A to i8* ; CHECK-NEXT: %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 ; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0 @@ -46,7 +46,7 @@ ; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1 ; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8* ; CHECK-NEXT: store i8* %19, i8** %18 -; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0)) +; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0)) ; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) ; CHECK-NEXT: call void @polly_freeKernel(i8* %20) ; CHECK-NEXT: call void @polly_synchronizeDevice() Index: test/GPGPU/size-cast.ll =================================================================== --- test/GPGPU/size-cast.ll +++ test/GPGPU/size-cast.ll @@ -29,7 +29,7 @@ ; CODE-NEXT: if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1) ; CODE-NEXT: Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0); -; IR: call i8* @polly_initContext() +; IR: call i8* @polly_initContextCUDA() ; IR-NEXT: sext i32 %arg to i64 ; IR-NEXT: mul i64 ; IR-NEXT: @polly_allocateMemoryForDevice Index: tools/CMakeLists.txt =================================================================== --- tools/CMakeLists.txt +++ tools/CMakeLists.txt @@ -1,5 +1,5 @@ -if (CUDALIB_FOUND) +if (CUDALIB_FOUND OR OpenCL_FOUND) add_subdirectory(GPURuntime) -endif (CUDALIB_FOUND) +endif (CUDALIB_FOUND OR OpenCL_FOUND) set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE) Index: tools/GPURuntime/GPUJIT.h =================================================================== --- tools/GPURuntime/GPUJIT.h +++ tools/GPURuntime/GPUJIT.h @@ -76,12 +76,27 @@ * */ +typedef enum PollyGPURuntimeT { + RUNTIME_NONE, + RUNTIME_CUDA, + RUNTIME_CL +} PollyGPURuntime; + typedef struct PollyGPUContextT PollyGPUContext; typedef struct PollyGPUFunctionT PollyGPUFunction; typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr; -PollyGPUContext *polly_initContext(); -PollyGPUFunction *polly_getKernel(const char *PTXBuffer, +typedef struct OpenCLContextT OpenCLContext; +typedef struct OpenCLKernelT OpenCLKernel; +typedef struct OpenCLDevicePtrT OpenCLDevicePtr; + +typedef struct CUDAContextT CUDAContext; +typedef struct CUDAKernelT CUDAKernel; +typedef struct CUDADevicePtrT CUDADevicePtr; + +PollyGPUContext *polly_initContextCUDA(); +PollyGPUContext *polly_initContextCL(); +PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, const char *KernelName); void polly_freeKernel(PollyGPUFunction *Kernel); void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, Index: tools/GPURuntime/GPUJIT.c =================================================================== --- tools/GPURuntime/GPUJIT.c +++ tools/GPURuntime/GPUJIT.c @@ -12,8 +12,20 @@ /******************************************************************************/ #include "GPUJIT.h" + +#ifdef HAS_LIBCUDART #include #include +#endif /* HAS_LIBCUDART */ + +#ifdef HAS_LIBOPENCL +#ifdef __APPLE__ +#include +#else +#include +#endif +#endif /* HAS_LIBOPENCL */ + #include #include #include @@ -22,6 +34,8 @@ static int DebugMode; static int CacheMode; +static PollyGPURuntime Runtime = RUNTIME_NONE; + static void debug_print(const char *format, ...) { if (!DebugMode) return; @@ -33,18 +47,853 @@ } #define dump_function() debug_print("-> %s\n", __func__) -/* Define Polly's GPGPU data types. */ +#define KERNEL_CACHE_SIZE 10 + +static void err_runtime() { + fprintf(stderr, "Runtime not correctly initialized.\n"); + exit(-1); +} + struct PollyGPUContextT { - CUcontext Cuda; + void *Context; }; struct PollyGPUFunctionT { + void *Kernel; +}; + +struct PollyGPUDevicePtrT { + void *DevicePtr; +}; + +/******************************************************************************/ +/* OpenCL */ +/******************************************************************************/ +#ifdef HAS_LIBOPENCL + +struct OpenCLContextT { + cl_context Context; + cl_command_queue CommandQueue; +}; + +struct OpenCLKernelT { + cl_kernel Kernel; + cl_program Program; + const char *BinaryString; +}; + +struct OpenCLDevicePtrT { + cl_mem MemObj; +}; + +/* Dynamic library handles for the OpenCL runtime library. */ +static void *HandleOpenCL; + +/* Type-defines of function pointer to OpenCL Runtime API. */ +typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries, + cl_platform_id *Platforms, + cl_uint *NumPlatforms); +static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr; + +typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform, + cl_device_type DeviceType, + cl_uint NumEntries, cl_device_id *Devices, + cl_uint *NumDevices); +static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr; + +typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device, + cl_device_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet); +static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr; + +typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet); +static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr; + +typedef cl_context clCreateContextFcnTy( + const cl_context_properties *Properties, cl_uint NumDevices, + const cl_device_id *Devices, + void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo, + size_t CB, void *UserData), + void *UserData, cl_int *ErrcodeRet); +static clCreateContextFcnTy *clCreateContextFcnPtr; + +typedef cl_command_queue +clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device, + cl_command_queue_properties Properties, + cl_int *ErrcodeRet); +static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr; + +typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags, + size_t Size, void *HostPtr, + cl_int *ErrcodeRet); +static clCreateBufferFcnTy *clCreateBufferFcnPtr; + +typedef cl_int +clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer, + cl_bool BlockingWrite, size_t Offset, size_t Size, + const void *Ptr, cl_uint NumEventsInWaitList, + const cl_event *EventWaitList, cl_event *Event); +static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr; + +typedef cl_program clCreateProgramWithBinaryFcnTy( + cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList, + const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus, + cl_int *ErrcodeRet); +static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr; + +typedef cl_int clBuildProgramFcnTy( + cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList, + const char *Options, + void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData), + void *UserData); +static clBuildProgramFcnTy *clBuildProgramFcnPtr; + +typedef cl_kernel clCreateKernelFcnTy(cl_program Program, + const char *KernelName, + cl_int *ErrcodeRet); +static clCreateKernelFcnTy *clCreateKernelFcnPtr; + +typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex, + size_t ArgSize, const void *ArgValue); +static clSetKernelArgFcnTy *clSetKernelArgFcnPtr; + +typedef cl_int clEnqueueNDRangeKernelFcnTy( + cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim, + const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize, cl_uint NumEventsInWaitList, + const cl_event *EventWaitList, cl_event *Event); +static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr; + +typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue, + cl_mem Buffer, cl_bool BlockingRead, + size_t Offset, size_t Size, void *Ptr, + cl_uint NumEventsInWaitList, + const cl_event *EventWaitList, + cl_event *Event); +static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr; + +typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue); +static clFlushFcnTy *clFlushFcnPtr; + +typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue); +static clFinishFcnTy *clFinishFcnPtr; + +typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel); +static clReleaseKernelFcnTy *clReleaseKernelFcnPtr; + +typedef cl_int clReleaseProgramFcnTy(cl_program Program); +static clReleaseProgramFcnTy *clReleaseProgramFcnPtr; + +typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject); +static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr; + +typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue); +static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr; + +typedef cl_int clReleaseContextFcnTy(cl_context Context); +static clReleaseContextFcnTy *clReleaseContextFcnPtr; + +static void *getAPIHandleCL(void *Handle, const char *FuncName) { + char *Err; + void *FuncPtr; + dlerror(); + FuncPtr = dlsym(Handle, FuncName); + if ((Err = dlerror()) != 0) { + fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err); + return 0; + } + return FuncPtr; +} + +static int initialDeviceAPILibrariesCL() { + HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY); + if (!HandleOpenCL) { + fprintf(stderr, "Cannot open library: %s. \n", dlerror()); + return 0; + } + return 1; +} + +static int initialDeviceAPIsCL() { + if (initialDeviceAPILibrariesCL() == 0) + return 0; + + /* Get function pointer to OpenCL Runtime API. + * + * Note that compilers conforming to the ISO C standard are required to + * generate a warning if a conversion from a void * pointer to a function + * pointer is attempted as in the following statements. The warning + * of this kind of cast may not be emitted by clang and new versions of gcc + * as it is valid on POSIX 2008. + */ + clGetPlatformIDsFcnPtr = + (clGetPlatformIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetPlatformIDs"); + + clGetDeviceIDsFcnPtr = + (clGetDeviceIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceIDs"); + + clGetDeviceInfoFcnPtr = + (clGetDeviceInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceInfo"); + + clGetKernelInfoFcnPtr = + (clGetKernelInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetKernelInfo"); + + clCreateContextFcnPtr = + (clCreateContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateContext"); + + clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL( + HandleOpenCL, "clCreateCommandQueue"); + + clCreateBufferFcnPtr = + (clCreateBufferFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateBuffer"); + + clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL( + HandleOpenCL, "clEnqueueWriteBuffer"); + + clCreateProgramWithBinaryFcnPtr = + (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL( + HandleOpenCL, "clCreateProgramWithBinary"); + + clBuildProgramFcnPtr = + (clBuildProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clBuildProgram"); + + clCreateKernelFcnPtr = + (clCreateKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateKernel"); + + clSetKernelArgFcnPtr = + (clSetKernelArgFcnTy *)getAPIHandleCL(HandleOpenCL, "clSetKernelArg"); + + clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL( + HandleOpenCL, "clEnqueueNDRangeKernel"); + + clEnqueueReadBufferFcnPtr = (clEnqueueReadBufferFcnTy *)getAPIHandleCL( + HandleOpenCL, "clEnqueueReadBuffer"); + + clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(HandleOpenCL, "clFlush"); + + clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(HandleOpenCL, "clFinish"); + + clReleaseKernelFcnPtr = + (clReleaseKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseKernel"); + + clReleaseProgramFcnPtr = + (clReleaseProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseProgram"); + + clReleaseMemObjectFcnPtr = (clReleaseMemObjectFcnTy *)getAPIHandleCL( + HandleOpenCL, "clReleaseMemObject"); + + clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL( + HandleOpenCL, "clReleaseCommandQueue"); + + clReleaseContextFcnPtr = + (clReleaseContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseContext"); + + return 1; +} + +/* Context and Device. */ +static PollyGPUContext *GlobalContext = NULL; +static cl_device_id GlobalDeviceID = NULL; + +/* Fd-Decl: Print out OpenCL Error codes to human readable strings. */ +static void printOpenCLError(int Error); + +static void checkOpenCLError(int Ret, const char *format, ...) { + if (Ret == CL_SUCCESS) + return; + + printOpenCLError(Ret); + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + exit(-1); +} + +static PollyGPUContext *initContextCL() { + dump_function(); + + PollyGPUContext *Context; + + cl_platform_id PlatformID = NULL; + cl_device_id DeviceID = NULL; + cl_uint NumDevicesRet; + cl_int Ret; + + char DeviceRevision[256]; + char DeviceName[256]; + size_t DeviceRevisionRetSize, DeviceNameRetSize; + + static __thread PollyGPUContext *CurrentContext = NULL; + + if (CurrentContext) + return CurrentContext; + + /* Get API handles. */ + if (initialDeviceAPIsCL() == 0) { + fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n"); + exit(-1); + } + + /* Get number of devices that support OpenCL. */ + static const int NumberOfPlatforms = 1; + Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL); + checkOpenCLError(Ret, "Failed to get platform IDs.\n"); + // TODO: Extend to CL_DEVICE_TYPE_ALL? + static const int NumberOfDevices = 1; + Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices, + &DeviceID, &NumDevicesRet); + checkOpenCLError(Ret, "Failed to get device IDs.\n"); + + GlobalDeviceID = DeviceID; + if (NumDevicesRet == 0) { + fprintf(stderr, "There is no device supporting OpenCL.\n"); + exit(-1); + } + + /* Get device revision. */ + Ret = + clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision), + DeviceRevision, &DeviceRevisionRetSize); + checkOpenCLError(Ret, "Failed to fetch device revision.\n"); + + /* Get device name. */ + Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName), + DeviceName, &DeviceNameRetSize); + checkOpenCLError(Ret, "Failed to fetch device name.\n"); + + debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName); + + /* Create context on the device. */ + Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext)); + if (Context == 0) { + fprintf(stderr, "Allocate memory for Polly GPU context failed.\n"); + exit(-1); + } + Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext)); + if (Context->Context == 0) { + fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n"); + exit(-1); + } + ((OpenCLContext *)Context->Context)->Context = + clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret); + checkOpenCLError(Ret, "Failed to create context.\n"); + + static const int ExtraProperties = 0; + ((OpenCLContext *)Context->Context)->CommandQueue = + clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context, + DeviceID, ExtraProperties, &Ret); + checkOpenCLError(Ret, "Failed to create command queue.\n"); + + if (CacheMode) + CurrentContext = Context; + + GlobalContext = Context; + return Context; +} + +static void freeKernelCL(PollyGPUFunction *Kernel) { + dump_function(); + + if (CacheMode) + return; + + if (!GlobalContext) { + fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + cl_int Ret; + Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); + checkOpenCLError(Ret, "Failed to flush command queue.\n"); + Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue); + checkOpenCLError(Ret, "Failed to finish command queue.\n"); + + if (((OpenCLKernel *)Kernel->Kernel)->Kernel) { + cl_int Ret = + clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel); + checkOpenCLError(Ret, "Failed to release kernel.\n"); + } + + if (((OpenCLKernel *)Kernel->Kernel)->Program) { + cl_int Ret = + clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program); + checkOpenCLError(Ret, "Failed to release program.\n"); + } + + if (Kernel->Kernel) + free((OpenCLKernel *)Kernel->Kernel); + + if (Kernel) + free(Kernel); +} + +static PollyGPUFunction *getKernelCL(const char *BinaryBuffer, + const char *KernelName) { + dump_function(); + + if (!GlobalContext) { + fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; + static __thread int NextCacheItem = 0; + + for (long i = 0; i < KERNEL_CACHE_SIZE; i++) { + // We exploit here the property that all Polly-ACC kernels are allocated + // as global constants, hence a pointer comparision is sufficient to + // determin equality. + if (KernelCache[i] && + ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString == + BinaryBuffer) { + debug_print(" -> using cached kernel\n"); + return KernelCache[i]; + } + } + + PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); + if (Function == 0) { + fprintf(stderr, "Allocate memory for Polly GPU function failed.\n"); + exit(-1); + } + Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel)); + if (Function->Kernel == 0) { + fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n"); + exit(-1); + } + + if (!GlobalDeviceID) { + fprintf(stderr, "GPGPU-code generation not initialized correctly.\n"); + exit(-1); + } + + cl_int Ret; + size_t BinarySize = strlen(BinaryBuffer); + ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr( + ((OpenCLContext *)GlobalContext->Context)->Context, 1, &GlobalDeviceID, + (const size_t *)&BinarySize, (const unsigned char **)&BinaryBuffer, NULL, + &Ret); + checkOpenCLError(Ret, "Failed to create program from binary.\n"); + + Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1, + &GlobalDeviceID, NULL, NULL, NULL); + checkOpenCLError(Ret, "Failed to build program.\n"); + + ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr( + ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret); + checkOpenCLError(Ret, "Failed to create kernel.\n"); + + ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer; + + if (CacheMode) { + if (KernelCache[NextCacheItem]) + freeKernelCL(KernelCache[NextCacheItem]); + + KernelCache[NextCacheItem] = Function; + + NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE; + } + + return Function; +} + +static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData, + long MemSize) { + dump_function(); + + if (!GlobalContext) { + fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + cl_int Ret; + Ret = clEnqueueWriteBufferFcnPtr( + ((OpenCLContext *)GlobalContext->Context)->CommandQueue, + ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize, + HostData, 0, NULL, NULL); + checkOpenCLError(Ret, "Copying data from host memory to device failed.\n"); +} + +static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData, + long MemSize) { + dump_function(); + + if (!GlobalContext) { + fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + cl_int Ret; + Ret = clEnqueueReadBufferFcnPtr( + ((OpenCLContext *)GlobalContext->Context)->CommandQueue, + ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize, + HostData, 0, NULL, NULL); + checkOpenCLError(Ret, "Copying results from device to host memory failed.\n"); +} + +static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX, + unsigned int GridDimY, unsigned int BlockDimX, + unsigned int BlockDimY, unsigned int BlockDimZ, + void **Parameters) { + dump_function(); + + cl_int Ret; + cl_uint NumArgs; + + if (!GlobalContext) { + fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel; + Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS, + sizeof(cl_uint), &NumArgs, NULL); + checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n"); + + // TODO: Pass the size of the kernel arguments in to launchKernelCL, along + // with the arguments themselves. This is a dirty workaround that can be + // broken. + for (cl_uint i = 0; i < NumArgs; i++) { + Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 8, (void *)Parameters[i]); + if (Ret == CL_INVALID_ARG_SIZE) { + Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 4, (void *)Parameters[i]); + if (Ret == CL_INVALID_ARG_SIZE) { + Ret = + clSetKernelArgFcnPtr(CLKernel->Kernel, i, 2, (void *)Parameters[i]); + if (Ret == CL_INVALID_ARG_SIZE) { + Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 1, + (void *)Parameters[i]); + checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i); + } + } + } + if (Ret != CL_SUCCESS && Ret != CL_INVALID_ARG_SIZE) { + fprintf(stderr, "Failed to set Kernel argument.\n"); + printOpenCLError(Ret); + exit(-1); + } + } + + unsigned int GridDimZ = 1; + size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY, + BlockDimZ * GridDimZ}; + size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ}; + + static const int WorkDim = 3; + OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context; + Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel, + WorkDim, NULL, GlobalWorkSize, + LocalWorkSize, 0, NULL, NULL); + checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n"); +} + +static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) { + dump_function(); + + OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; + cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj); + checkOpenCLError(Ret, "Failed to free device memory.\n"); + + free(DevPtr); + free(Allocation); +} + +static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) { + dump_function(); + + if (!GlobalContext) { + fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); + if (DevData == 0) { + fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); + exit(-1); + } + DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr)); + if (DevData->DevicePtr == 0) { + fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); + exit(-1); + } + + cl_int Ret; + ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj = + clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context, + CL_MEM_READ_WRITE, MemSize, NULL, &Ret); + checkOpenCLError(Ret, + "Allocate memory for GPU device memory pointer failed.\n"); + + return DevData; +} + +static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) { + dump_function(); + + OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr; + return (void *)DevPtr->MemObj; +} + +static void synchronizeDeviceCL() { + dump_function(); + + if (!GlobalContext) { + fprintf(stderr, "GPGPU-code generation not correctly initialized.\n"); + exit(-1); + } + + if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) != + CL_SUCCESS) { + fprintf(stderr, "Synchronizing device and host memory failed.\n"); + exit(-1); + } +} + +static void freeContextCL(PollyGPUContext *Context) { + dump_function(); + + cl_int Ret; + + GlobalContext = NULL; + + OpenCLContext *Ctx = (OpenCLContext *)Context->Context; + if (Ctx->CommandQueue) { + Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue); + checkOpenCLError(Ret, "Could not release command queue.\n"); + } + + if (Ctx->Context) { + Ret = clReleaseContextFcnPtr(Ctx->Context); + checkOpenCLError(Ret, "Could not release context.\n"); + } + + free(Ctx); + free(Context); +} + +static void printOpenCLError(int Error) { + + switch (Error) { + case CL_SUCCESS: + // Success, don't print an error. + break; + + // JIT/Runtime errors. + case CL_DEVICE_NOT_FOUND: + fprintf(stderr, "Device not found.\n"); + break; + case CL_DEVICE_NOT_AVAILABLE: + fprintf(stderr, "Device not available.\n"); + break; + case CL_COMPILER_NOT_AVAILABLE: + fprintf(stderr, "Compiler not available.\n"); + break; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + fprintf(stderr, "Mem object allocation failure.\n"); + break; + case CL_OUT_OF_RESOURCES: + fprintf(stderr, "Out of resources.\n"); + break; + case CL_OUT_OF_HOST_MEMORY: + fprintf(stderr, "Out of host memory.\n"); + break; + case CL_PROFILING_INFO_NOT_AVAILABLE: + fprintf(stderr, "Profiling info not available.\n"); + break; + case CL_MEM_COPY_OVERLAP: + fprintf(stderr, "Mem copy overlap.\n"); + break; + case CL_IMAGE_FORMAT_MISMATCH: + fprintf(stderr, "Image format mismatch.\n"); + break; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: + fprintf(stderr, "Image format not supported.\n"); + break; + case CL_BUILD_PROGRAM_FAILURE: + fprintf(stderr, "Build program failure.\n"); + break; + case CL_MAP_FAILURE: + fprintf(stderr, "Map failure.\n"); + break; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: + fprintf(stderr, "Misaligned sub buffer offset.\n"); + break; + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: + fprintf(stderr, "Exec status error for events in wait list.\n"); + break; + case CL_COMPILE_PROGRAM_FAILURE: + fprintf(stderr, "Compile program failure.\n"); + break; + case CL_LINKER_NOT_AVAILABLE: + fprintf(stderr, "Linker not available.\n"); + break; + case CL_LINK_PROGRAM_FAILURE: + fprintf(stderr, "Link program failure.\n"); + break; + case CL_DEVICE_PARTITION_FAILED: + fprintf(stderr, "Device partition failed.\n"); + break; + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: + fprintf(stderr, "Kernel arg info not available.\n"); + break; + + // Compiler errors. + case CL_INVALID_VALUE: + fprintf(stderr, "Invalid value.\n"); + break; + case CL_INVALID_DEVICE_TYPE: + fprintf(stderr, "Invalid device type.\n"); + break; + case CL_INVALID_PLATFORM: + fprintf(stderr, "Invalid platform.\n"); + break; + case CL_INVALID_DEVICE: + fprintf(stderr, "Invalid device.\n"); + break; + case CL_INVALID_CONTEXT: + fprintf(stderr, "Invalid context.\n"); + break; + case CL_INVALID_QUEUE_PROPERTIES: + fprintf(stderr, "Invalid queue properties.\n"); + break; + case CL_INVALID_COMMAND_QUEUE: + fprintf(stderr, "Invalid command queue.\n"); + break; + case CL_INVALID_HOST_PTR: + fprintf(stderr, "Invalid host pointer.\n"); + break; + case CL_INVALID_MEM_OBJECT: + fprintf(stderr, "Invalid memory object.\n"); + break; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: + fprintf(stderr, "Invalid image format descriptor.\n"); + break; + case CL_INVALID_IMAGE_SIZE: + fprintf(stderr, "Invalid image size.\n"); + break; + case CL_INVALID_SAMPLER: + fprintf(stderr, "Invalid sampler.\n"); + break; + case CL_INVALID_BINARY: + fprintf(stderr, "Invalid binary.\n"); + break; + case CL_INVALID_BUILD_OPTIONS: + fprintf(stderr, "Invalid build options.\n"); + break; + case CL_INVALID_PROGRAM: + fprintf(stderr, "Invalid program.\n"); + break; + case CL_INVALID_PROGRAM_EXECUTABLE: + fprintf(stderr, "Invalid program executable.\n"); + break; + case CL_INVALID_KERNEL_NAME: + fprintf(stderr, "Invalid kernel name.\n"); + break; + case CL_INVALID_KERNEL_DEFINITION: + fprintf(stderr, "Invalid kernel definition.\n"); + break; + case CL_INVALID_KERNEL: + fprintf(stderr, "Invalid kernel.\n"); + break; + case CL_INVALID_ARG_INDEX: + fprintf(stderr, "Invalid arg index.\n"); + break; + case CL_INVALID_ARG_VALUE: + fprintf(stderr, "Invalid arg value.\n"); + break; + case CL_INVALID_ARG_SIZE: + fprintf(stderr, "Invalid arg size.\n"); + break; + case CL_INVALID_KERNEL_ARGS: + fprintf(stderr, "Invalid kernel args.\n"); + break; + case CL_INVALID_WORK_DIMENSION: + fprintf(stderr, "Invalid work dimension.\n"); + break; + case CL_INVALID_WORK_GROUP_SIZE: + fprintf(stderr, "Invalid work group size.\n"); + break; + case CL_INVALID_WORK_ITEM_SIZE: + fprintf(stderr, "Invalid work item size.\n"); + break; + case CL_INVALID_GLOBAL_OFFSET: + fprintf(stderr, "Invalid global offset.\n"); + break; + case CL_INVALID_EVENT_WAIT_LIST: + fprintf(stderr, "Invalid event wait list.\n"); + break; + case CL_INVALID_EVENT: + fprintf(stderr, "Invalid event.\n"); + break; + case CL_INVALID_OPERATION: + fprintf(stderr, "Invalid operation.\n"); + break; + case CL_INVALID_GL_OBJECT: + fprintf(stderr, "Invalid GL object.\n"); + break; + case CL_INVALID_BUFFER_SIZE: + fprintf(stderr, "Invalid buffer size.\n"); + break; + case CL_INVALID_MIP_LEVEL: + fprintf(stderr, "Invalid mip level.\n"); + break; + case CL_INVALID_GLOBAL_WORK_SIZE: + fprintf(stderr, "Invalid global work size.\n"); + break; + case CL_INVALID_PROPERTY: + fprintf(stderr, "Invalid property.\n"); + break; + case CL_INVALID_IMAGE_DESCRIPTOR: + fprintf(stderr, "Invalid image descriptor.\n"); + break; + case CL_INVALID_COMPILER_OPTIONS: + fprintf(stderr, "Invalid compiler options.\n"); + break; + case CL_INVALID_LINKER_OPTIONS: + fprintf(stderr, "Invalid linker options.\n"); + break; + case CL_INVALID_DEVICE_PARTITION_COUNT: + fprintf(stderr, "Invalid device partition count.\n"); + break; + case CL_INVALID_PIPE_SIZE: + fprintf(stderr, "Invalid pipe size.\n"); + break; + case CL_INVALID_DEVICE_QUEUE: + fprintf(stderr, "Invalid device queue.\n"); + break; + + // NVIDIA specific error. + case -9999: + fprintf(stderr, "NVIDIA invalid read or write buffer.\n"); + break; + + default: + fprintf(stderr, "Unknown error code!\n"); + break; + } +} + +#endif /* HAS_LIBOPENCL */ +/******************************************************************************/ +/* CUDA */ +/******************************************************************************/ +#ifdef HAS_LIBCUDART + +struct CUDAContextT { + CUcontext Cuda; +}; + +struct CUDAKernelT { CUfunction Cuda; CUmodule CudaModule; - const char *PTXString; + const char *BinaryString; }; -struct PollyGPUDevicePtrT { +struct CUDADevicePtrT { CUdeviceptr Cuda; }; @@ -57,10 +906,10 @@ static CuMemAllocFcnTy *CuMemAllocFcnPtr; typedef CUresult CUDAAPI CuLaunchKernelFcnTy( - CUfunction f, unsigned int gridDimX, unsigned int gridDimY, - unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, - unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, - void **kernelParams, void **extra); + CUfunction F, unsigned int GridDimX, unsigned int GridDimY, + unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY, + unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream, + void **KernelParams, void **Extra); static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr; typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t); @@ -95,8 +944,8 @@ void **); static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr; -typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module, - const void *image); +typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module, + const void *Image); static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr; typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule, @@ -109,25 +958,25 @@ typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice); static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr; -typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state, - CUjitInputType type, void *data, - size_t size, const char *name, - unsigned int numOptions, - CUjit_option *options, - void **optionValues); +typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State, + CUjitInputType Type, void *Data, + size_t Size, const char *Name, + unsigned int NumOptions, + CUjit_option *Options, + void **OptionValues); static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr; -typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions, - CUjit_option *options, - void **optionValues, - CUlinkState *stateOut); +typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions, + CUjit_option *Options, + void **OptionValues, + CUlinkState *StateOut); static CuLinkCreateFcnTy *CuLinkCreateFcnPtr; -typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut, - size_t *sizeOut); +typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut, + size_t *SizeOut); static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr; -typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state); +typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State); static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr; typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy(); @@ -137,36 +986,36 @@ typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void); static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr; -static void *getAPIHandle(void *Handle, const char *FuncName) { +static void *getAPIHandleCUDA(void *Handle, const char *FuncName) { char *Err; void *FuncPtr; dlerror(); FuncPtr = dlsym(Handle, FuncName); if ((Err = dlerror()) != 0) { - fprintf(stdout, "Load CUDA driver API failed: %s. \n", Err); + fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err); return 0; } return FuncPtr; } -static int initialDeviceAPILibraries() { +static int initialDeviceAPILibrariesCUDA() { HandleCuda = dlopen("libcuda.so", RTLD_LAZY); if (!HandleCuda) { - printf("Cannot open library: %s. \n", dlerror()); + fprintf(stderr, "Cannot open library: %s. \n", dlerror()); return 0; } HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY); if (!HandleCudaRT) { - printf("Cannot open library: %s. \n", dlerror()); + fprintf(stderr, "Cannot open library: %s. \n", dlerror()); return 0; } return 1; } -static int initialDeviceAPIs() { - if (initialDeviceAPILibraries() == 0) +static int initialDeviceAPIsCUDA() { + if (initialDeviceAPILibrariesCUDA() == 0) return 0; /* Get function pointer to CUDA Driver APIs. @@ -178,77 +1027,76 @@ * as it is valid on POSIX 2008. */ CuLaunchKernelFcnPtr = - (CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel"); + (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel"); CuMemAllocFcnPtr = - (CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2"); + (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2"); - CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandle(HandleCuda, "cuMemFree_v2"); + CuMemFreeFcnPtr = + (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2"); CuMemcpyDtoHFcnPtr = - (CuMemcpyDtoHFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyDtoH_v2"); + (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2"); CuMemcpyHtoDFcnPtr = - (CuMemcpyHtoDFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyHtoD_v2"); + (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2"); CuModuleUnloadFcnPtr = - (CuModuleUnloadFcnTy *)getAPIHandle(HandleCuda, "cuModuleUnload"); + (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload"); CuCtxDestroyFcnPtr = - (CuCtxDestroyFcnTy *)getAPIHandle(HandleCuda, "cuCtxDestroy"); + (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy"); - CuInitFcnPtr = (CuInitFcnTy *)getAPIHandle(HandleCuda, "cuInit"); + CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit"); CuDeviceGetCountFcnPtr = - (CuDeviceGetCountFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetCount"); + (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount"); CuDeviceGetFcnPtr = - (CuDeviceGetFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGet"); + (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet"); CuCtxCreateFcnPtr = - (CuCtxCreateFcnTy *)getAPIHandle(HandleCuda, "cuCtxCreate_v2"); + (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2"); - CuModuleLoadDataExFcnPtr = - (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx"); + CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA( + HandleCuda, "cuModuleLoadDataEx"); CuModuleLoadDataFcnPtr = - (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData"); + (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData"); - CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle( + CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA( HandleCuda, "cuModuleGetFunction"); CuDeviceComputeCapabilityFcnPtr = - (CuDeviceComputeCapabilityFcnTy *)getAPIHandle( + (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA( HandleCuda, "cuDeviceComputeCapability"); CuDeviceGetNameFcnPtr = - (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName"); + (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName"); CuLinkAddDataFcnPtr = - (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData"); + (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData"); CuLinkCreateFcnPtr = - (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate"); + (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate"); CuLinkCompleteFcnPtr = - (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete"); + (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete"); CuLinkDestroyFcnPtr = - (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy"); + (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy"); CuCtxSynchronizeFcnPtr = - (CuCtxSynchronizeFcnTy *)getAPIHandle(HandleCuda, "cuCtxSynchronize"); + (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize"); /* Get function pointer to CUDA Runtime APIs. */ - CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle( + CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA( HandleCudaRT, "cudaThreadSynchronize"); return 1; } -PollyGPUContext *polly_initContext() { - DebugMode = getenv("POLLY_DEBUG") != 0; - +static PollyGPUContext *initContextCUDA() { dump_function(); PollyGPUContext *Context; CUdevice Device; @@ -263,20 +1111,20 @@ return CurrentContext; /* Get API handles. */ - if (initialDeviceAPIs() == 0) { - fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n"); + if (initialDeviceAPIsCUDA() == 0) { + fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n"); exit(-1); } if (CuInitFcnPtr(0) != CUDA_SUCCESS) { - fprintf(stdout, "Initializing the CUDA driver API failed.\n"); + fprintf(stderr, "Initializing the CUDA driver API failed.\n"); exit(-1); } /* Get number of devices that supports CUDA. */ CuDeviceGetCountFcnPtr(&DeviceCount); if (DeviceCount == 0) { - fprintf(stdout, "There is no device supporting CUDA.\n"); + fprintf(stderr, "There is no device supporting CUDA.\n"); exit(-1); } @@ -290,12 +1138,15 @@ /* Create context on the device. */ Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext)); if (Context == 0) { - fprintf(stdout, "Allocate memory for Polly GPU context failed.\n"); + fprintf(stderr, "Allocate memory for Polly GPU context failed.\n"); exit(-1); } - CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device); - - CacheMode = getenv("POLLY_NOCACHE") == 0; + Context->Context = malloc(sizeof(CUDAContext)); + if (Context->Context == 0) { + fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n"); + exit(-1); + } + CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0, Device); if (CacheMode) CurrentContext = Context; @@ -303,18 +1154,24 @@ return Context; } -static void freeKernel(PollyGPUFunction *Kernel) { - if (Kernel->CudaModule) - CuModuleUnloadFcnPtr(Kernel->CudaModule); +static void freeKernelCUDA(PollyGPUFunction *Kernel) { + dump_function(); + + if (CacheMode) + return; + + if (((CUDAKernel *)Kernel->Kernel)->CudaModule) + CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule); + + if (Kernel->Kernel) + free((CUDAKernel *)Kernel->Kernel); if (Kernel) free(Kernel); } -#define KERNEL_CACHE_SIZE 10 - -PollyGPUFunction *polly_getKernel(const char *PTXBuffer, - const char *KernelName) { +static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer, + const char *KernelName) { dump_function(); static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; @@ -324,16 +1181,21 @@ // We exploit here the property that all Polly-ACC kernels are allocated // as global constants, hence a pointer comparision is sufficient to // determin equality. - if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) { + if (KernelCache[i] && + ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) { debug_print(" -> using cached kernel\n"); return KernelCache[i]; } } PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); - if (Function == 0) { - fprintf(stdout, "Allocate memory for Polly GPU function failed.\n"); + fprintf(stderr, "Allocate memory for Polly GPU function failed.\n"); + exit(-1); + } + Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel)); + if (Function->Kernel == 0) { + fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n"); exit(-1); } @@ -370,43 +1232,45 @@ memset(ErrorLog, 0, sizeof(ErrorLog)); CuLinkCreateFcnPtr(6, Options, OptionVals, &LState); - Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer, - strlen(PTXBuffer) + 1, 0, 0, 0, 0); + Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer, + strlen(BinaryBuffer) + 1, 0, 0, 0, 0); if (Res != CUDA_SUCCESS) { - fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog); + fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog); exit(-1); } Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize); if (Res != CUDA_SUCCESS) { - fprintf(stdout, "Complete ptx linker step failed.\n"); - fprintf(stdout, "\n%s\n", ErrorLog); + fprintf(stderr, "Complete ptx linker step failed.\n"); + fprintf(stderr, "\n%s\n", ErrorLog); exit(-1); } debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime, InfoLog); - Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut); + Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule), + CuOut); if (Res != CUDA_SUCCESS) { - fprintf(stdout, "Loading ptx assembly text failed.\n"); + fprintf(stderr, "Loading ptx assembly text failed.\n"); exit(-1); } - Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule, + Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda), + ((CUDAKernel *)Function->Kernel)->CudaModule, KernelName); if (Res != CUDA_SUCCESS) { - fprintf(stdout, "Loading kernel function failed.\n"); + fprintf(stderr, "Loading kernel function failed.\n"); exit(-1); } CuLinkDestroyFcnPtr(LState); - Function->PTXString = PTXBuffer; + ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer; if (CacheMode) { if (KernelCache[NextCacheItem]) - freeKernel(KernelCache[NextCacheItem]); + freeKernelCUDA(KernelCache[NextCacheItem]); KernelCache[NextCacheItem] = Function; @@ -416,44 +1280,37 @@ return Function; } -void polly_freeKernel(PollyGPUFunction *Kernel) { +static void synchronizeDeviceCUDA() { dump_function(); - - if (CacheMode) - return; - - freeKernel(Kernel); + if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) { + fprintf(stderr, "Synchronizing device and host memory failed.\n"); + exit(-1); + } } -void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, - long MemSize) { +static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData, + long MemSize) { dump_function(); - CUdeviceptr CuDevData = DevData->Cuda; + CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda; CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize); } -void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, - long MemSize) { +static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData, + long MemSize) { dump_function(); - if (CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) { - fprintf(stdout, "Copying results from device to host memory failed.\n"); - exit(-1); - } -} -void polly_synchronizeDevice() { - dump_function(); - if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) { - fprintf(stdout, "Synchronizing device and host memory failed.\n"); + if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda, + MemSize) != CUDA_SUCCESS) { + fprintf(stderr, "Copying results from device to host memory failed.\n"); exit(-1); } } -void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, - unsigned int GridDimY, unsigned int BlockDimX, - unsigned int BlockDimY, unsigned int BlockDimZ, - void **Parameters) { +static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX, + unsigned int GridDimY, unsigned int BlockDimX, + unsigned int BlockDimY, unsigned int BlockDimZ, + void **Parameters) { dump_function(); unsigned GridDimZ = 1; @@ -462,45 +1319,290 @@ void **Extra = 0; CUresult Res; - Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ, - BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes, - Stream, Parameters, Extra); + Res = + CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX, + GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ, + SharedMemBytes, Stream, Parameters, Extra); if (Res != CUDA_SUCCESS) { - fprintf(stdout, "Launching CUDA kernel failed.\n"); + fprintf(stderr, "Launching CUDA kernel failed.\n"); exit(-1); } } -void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) { +static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) { dump_function(); - CuMemFreeFcnPtr((CUdeviceptr)Allocation->Cuda); + CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr; + CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda); + free(DevPtr); free(Allocation); } -PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) { +static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) { dump_function(); PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); - if (DevData == 0) { - fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); + fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); + exit(-1); + } + DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr)); + if (DevData->DevicePtr == 0) { + fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); exit(-1); } - CUresult Res = CuMemAllocFcnPtr(&(DevData->Cuda), MemSize); + CUresult Res = + CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize); if (Res != CUDA_SUCCESS) { - fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); + fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n"); exit(-1); } return DevData; } +static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) { + dump_function(); + + CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr; + return (void *)DevPtr->Cuda; +} + +static void freeContextCUDA(PollyGPUContext *Context) { + dump_function(); + + CUDAContext *Ctx = (CUDAContext *)Context->Context; + if (Ctx->Cuda) { + CuCtxDestroyFcnPtr(Ctx->Cuda); + free(Ctx); + free(Context); + } + + dlclose(HandleCuda); + dlclose(HandleCudaRT); +} + +#endif /* HAS_LIBCUDART */ +/******************************************************************************/ +/* API */ +/******************************************************************************/ + +PollyGPUContext *polly_initContext() { + DebugMode = getenv("POLLY_DEBUG") != 0; + CacheMode = getenv("POLLY_NOCACHE") == 0; + + dump_function(); + + PollyGPUContext *Context; + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + Context = initContextCUDA(); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + Context = initContextCL(); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } + + return Context; +} + +void polly_freeKernel(PollyGPUFunction *Kernel) { + dump_function(); + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + freeKernelCUDA(Kernel); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + freeKernelCL(Kernel); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } +} + +PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, + const char *KernelName) { + dump_function(); + + PollyGPUFunction *Function; + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + Function = getKernelCUDA(BinaryBuffer, KernelName); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + Function = getKernelCL(BinaryBuffer, KernelName); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } + + return Function; +} + +void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, + long MemSize) { + dump_function(); + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + copyFromHostToDeviceCUDA(HostData, DevData, MemSize); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + copyFromHostToDeviceCL(HostData, DevData, MemSize); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } +} + +void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, + long MemSize) { + dump_function(); + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + copyFromDeviceToHostCUDA(DevData, HostData, MemSize); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + copyFromDeviceToHostCL(DevData, HostData, MemSize); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } +} + +void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, + unsigned int GridDimY, unsigned int BlockDimX, + unsigned int BlockDimY, unsigned int BlockDimZ, + void **Parameters) { + dump_function(); + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, + BlockDimZ, Parameters); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ, + Parameters); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } +} + +void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) { + dump_function(); + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + freeDeviceMemoryCUDA(Allocation); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + freeDeviceMemoryCL(Allocation); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } +} + +PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) { + dump_function(); + + PollyGPUDevicePtr *DevData; + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + DevData = allocateMemoryForDeviceCUDA(MemSize); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + DevData = allocateMemoryForDeviceCL(MemSize); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } + + return DevData; +} + void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) { dump_function(); - return (void *)Allocation->Cuda; + void *DevPtr; + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + DevPtr = getDevicePtrCUDA(Allocation); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + DevPtr = getDevicePtrCL(Allocation); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } + + return DevPtr; +} + +void polly_synchronizeDevice() { + dump_function(); + + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + synchronizeDeviceCUDA(); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + synchronizeDeviceCL(); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); + } } void polly_freeContext(PollyGPUContext *Context) { @@ -509,11 +1611,40 @@ if (CacheMode) return; - if (Context->Cuda) { - CuCtxDestroyFcnPtr(Context->Cuda); - free(Context); + switch (Runtime) { +#ifdef HAS_LIBCUDART + case RUNTIME_CUDA: + freeContextCUDA(Context); + break; +#endif /* HAS_LIBCUDART */ +#ifdef HAS_LIBOPENCL + case RUNTIME_CL: + freeContextCL(Context); + break; +#endif /* HAS_LIBOPENCL */ + default: + err_runtime(); } +} - dlclose(HandleCuda); - dlclose(HandleCudaRT); +/* Initialize GPUJIT with CUDA as runtime library. */ +PollyGPUContext *polly_initContextCUDA() { +#ifdef HAS_LIBCUDART + Runtime = RUNTIME_CUDA; + return polly_initContext(); +#else + fprintf(stderr, "GPU Runtime was built without CUDA support.\n"); + exit(-1); +#endif /* HAS_LIBCUDART */ +} + +/* Initialize GPUJIT with OpenCL as runtime library. */ +PollyGPUContext *polly_initContextCL() { +#ifdef HAS_LIBOPENCL + Runtime = RUNTIME_CL; + return polly_initContext(); +#else + fprintf(stderr, "GPU Runtime was built without OpenCL support.\n"); + exit(-1); +#endif /* HAS_LIBOPENCL */ }