Index: include/polly/CodeGen/PPCGCodeGeneration.h =================================================================== --- include/polly/CodeGen/PPCGCodeGeneration.h +++ include/polly/CodeGen/PPCGCodeGeneration.h @@ -16,7 +16,7 @@ #define POLLY_PPCGCODEGENERATION_H /// The GPU Architecture to target. -enum GPUArch { NVPTX64, SPIR32, SPIR64 }; +enum GPUArch { NVPTX64, SPIR32, SPIR64, AMDGCN64 }; /// The GPU Runtime implementation to use. enum GPURuntime { CUDA, OpenCL }; Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -53,6 +53,8 @@ #include "llvm/Support/Debug.h" +#include + using namespace polly; using namespace llvm; @@ -118,6 +120,11 @@ cl::desc("The CUDA version to compile for"), cl::Hidden, cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory)); +static cl::opt + AMDArch("polly-acc-amd-arch", + cl::desc("The AMD architecture version to compile for"), cl::Hidden, + cl::init("gfx803"), cl::ZeroOrMore, cl::cat(PollyCategory)); + static cl::opt MinCompute("polly-acc-mincompute", cl::desc("Minimal number of compute statements to run on GPU."), @@ -715,10 +722,11 @@ /// Create a call to get a kernel from an assembly string. /// /// @param Buffer The string describing the kernel. + /// @param Size The size of the string describing the kernel. /// @param Entry The name of the kernel function to call. /// /// @returns A pointer to a kernel object - Value *createCallGetKernel(Value *Buffer, Value *Entry); + Value *createCallGetKernel(Value *Buffer, Value *Size, Value *Entry); /// Create a call to free a GPU kernel. /// @@ -735,10 +743,13 @@ /// @param GridBlockZ The size of the third block dimension. /// @param Parameters A pointer to an array that contains itself pointers to /// the parameter values passed for each kernel argument. + /// @param CLUseLocalWorkSize A boolean dictating whether the OpenCL Runtime + /// should automatically determine the Local work + /// group size or use the provided dimensions. void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, Value *GridDimY, Value *BlockDimX, Value *BlockDimY, Value *BlockDimZ, - Value *Parameters); + Value *Parameters, Value *CLUseLocalWorkSize); }; std::string GPUNodeBuilder::getKernelFuncName(int Kernel_id) { @@ -861,7 +872,8 @@ createCallFreeDeviceMemory(Array.second); } -Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) { +Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Size, + Value *Entry) { const char *Name = "polly_getKernel"; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Function *F = M->getFunction(Name); @@ -871,12 +883,13 @@ GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; std::vector Args; Args.push_back(Builder.getInt8PtrTy()); + Args.push_back(Builder.getInt64Ty()); Args.push_back(Builder.getInt8PtrTy()); FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); F = Function::Create(Ty, Linkage, Name, M); } - return Builder.CreateCall(F, {Buffer, Entry}); + return Builder.CreateCall(F, {Buffer, Size, Entry}); } Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) { @@ -899,7 +912,8 @@ void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, Value *GridDimY, Value *BlockDimX, Value *BlockDimY, Value *BlockDimZ, - Value *Parameters) { + Value *Parameters, + Value *CLUseLocalWorkSize) { const char *Name = "polly_launchKernel"; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Function *F = M->getFunction(Name); @@ -915,12 +929,13 @@ Args.push_back(Builder.getInt32Ty()); Args.push_back(Builder.getInt32Ty()); Args.push_back(Builder.getInt8PtrTy()); + Args.push_back(Builder.getInt32Ty()); FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); F = Function::Create(Ty, Linkage, Name, M); } Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, - BlockDimZ, Parameters}); + BlockDimZ, Parameters, CLUseLocalWorkSize}); } void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) { @@ -1342,6 +1357,9 @@ case GPUArch::NVPTX64: Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); break; + case GPUArch::AMDGCN64: + Sync = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_s_barrier); + break; } Builder.CreateCall(Sync, {}); @@ -1833,14 +1851,28 @@ std::string Name = getKernelFuncName(Kernel->id); Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name); + Value *KernelSize = Builder.getInt64(ASMString.length()); Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name"); - Value *GPUKernel = createCallGetKernel(KernelString, NameString); + Value *GPUKernel = createCallGetKernel(KernelString, KernelSize, NameString); Value *GridDimX, *GridDimY; std::tie(GridDimX, GridDimY) = getGridSizes(Kernel); + Value *CLUseLocalWorkSize; + + switch (Arch) { + case GPUArch::SPIR32: + case GPUArch::SPIR64: + CLUseLocalWorkSize = Builder.getInt32(1); + break; + case GPUArch::AMDGCN64: + case GPUArch::NVPTX64: + CLUseLocalWorkSize = Builder.getInt32(0); + break; + } + createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, - BlockDimZ, Parameters); + BlockDimZ, Parameters, CLUseLocalWorkSize); createCallFreeKernel(GPUKernel); for (auto Id : KernelIds) @@ -1868,6 +1900,13 @@ return Ret; } +/// Compute the DataLayout string for the AMDGPU backend. +static std::string computeAMDGCNDataLayout() { + return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" + "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; +} + /// Compute the DataLayout string for a SPIR kernel. /// /// @param is64Bit Are we looking for a 64 bit architecture? @@ -1969,6 +2008,9 @@ case GPUArch::NVPTX64: FN->setCallingConv(CallingConv::PTX_Kernel); break; + case GPUArch::AMDGCN64: + FN->setCallingConv(CallingConv::AMDGPU_KERNEL); + break; case GPUArch::SPIR32: case GPUArch::SPIR64: FN->setCallingConv(CallingConv::SPIR_KERNEL); @@ -2050,6 +2092,14 @@ IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y; IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z; break; + case GPUArch::AMDGCN64: + IntrinsicsBID[0] = Intrinsic::amdgcn_workitem_id_x; + IntrinsicsBID[1] = Intrinsic::amdgcn_workitem_id_y; + + IntrinsicsTID[0] = Intrinsic::amdgcn_workgroup_id_x; + IntrinsicsTID[1] = Intrinsic::amdgcn_workgroup_id_y; + IntrinsicsTID[2] = Intrinsic::amdgcn_workgroup_id_z; + break; } auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { @@ -2254,6 +2304,13 @@ GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl")); GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); break; + case GPUArch::AMDGCN64: + if (Runtime == GPURuntime::CUDA) + llvm_unreachable("Cannot generate AMD code for CUDA runtime"); + else if (Runtime == GPURuntime::OpenCL) + GPUModule->setTargetTriple(Triple::normalize("amdgcn-amd-amdhsa-opencl")); + GPUModule->setDataLayout(computeAMDGCNDataLayout()); + break; case GPUArch::SPIR32: GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown")); GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */)); @@ -2281,6 +2338,7 @@ createKernelVariables(Kernel, FN); switch (Arch) { + case GPUArch::AMDGCN64: case GPUArch::NVPTX64: insertKernelIntrinsics(Kernel); break; @@ -2305,6 +2363,16 @@ break; } break; + case GPUArch::AMDGCN64: + switch (Runtime) { + case GPURuntime::CUDA: + llvm_unreachable("Cannot generate AMD code for CUDA runtime"); + break; + case GPURuntime::OpenCL: + GPUTriple = llvm::Triple(Triple::normalize("amdgcn-amd-amdhsa-opencl")); + break; + } + break; case GPUArch::SPIR64: case GPUArch::SPIR32: std::string SPIRAssembly; @@ -2331,6 +2399,9 @@ case GPUArch::NVPTX64: subtarget = CudaVersion; break; + case GPUArch::AMDGCN64: + subtarget = AMDArch; + break; case GPUArch::SPIR32: case GPUArch::SPIR64: llvm_unreachable("No subtarget for SPIR architecture"); @@ -2345,15 +2416,48 @@ PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis())); - if (TargetM->addPassesToEmitFile( - PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) { - errs() << "The target does not support generation of this file type!\n"; - return ""; + if (Arch == GPUArch::AMDGCN64) { + if (TargetM->addPassesToEmitFile( + PM, ASMStream, TargetMachine::CGFT_ObjectFile, true /* verify */)) { + errs() << "The target does not support generation of this file type!\n"; + return ""; + } + } else { + if (TargetM->addPassesToEmitFile(PM, ASMStream, + TargetMachine::CGFT_AssemblyFile, + true /* verify */)) { + errs() << "The target does not support generation of this file type!\n"; + return ""; + } } PM.run(*GPUModule); - return ASMStream.str(); + std::string Assembly = ASMStream.str(); + + if (Arch == GPUArch::AMDGCN64) { + std::string FileDir = "/tmp/"; + std::string FileName = "polly_temp_kernel_amd.asm"; + std::string OutName = "polly_temp_kernel_amd.bin"; + std::ofstream ASMFile; + ASMFile.open(FileDir + FileName); + ASMFile << Assembly; + ASMFile.close(); + + std::string LLDCommand; + LLDCommand = + "ld.lld -shared " + FileDir + FileName + " -o " + FileDir + OutName; + system(LLDCommand.c_str()); + + std::ifstream ifs(FileDir + OutName); + Assembly.assign((std::istreambuf_iterator(ifs)), + (std::istreambuf_iterator())); + + remove((FileDir + FileName).c_str()); + remove((FileDir + OutName).c_str()); + } + + return Assembly; } bool GPUNodeBuilder::requiresCUDALibDevice() { Index: lib/Support/RegisterPasses.cpp =================================================================== --- lib/Support/RegisterPasses.cpp +++ lib/Support/RegisterPasses.cpp @@ -124,6 +124,8 @@ GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"), cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64", "target NVIDIA 64-bit architecture"), + clEnumValN(GPUArch::AMDGCN64, "amdgcn64", + "target AMD GCN 64-bit architecture"), clEnumValN(GPUArch::SPIR32, "spir32", "target SPIR 32-bit architecture"), clEnumValN(GPUArch::SPIR64, "spir64", Index: test/GPGPU/add-scalars-in-scop-to-kills.ll =================================================================== --- test/GPGPU/add-scalars-in-scop-to-kills.ll +++ test/GPGPU/add-scalars-in-scop-to-kills.ll @@ -13,7 +13,7 @@ ; Check that kernel launch is generated in host IR. ; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32) ; Check that we add variables that are local to a scop into the kills that we ; pass to PPCG. This should enable PPCG to codegen this example. Index: test/GPGPU/bounds-construction-with-ignore-param-bounds.ll =================================================================== --- test/GPGPU/bounds-construction-with-ignore-param-bounds.ll +++ test/GPGPU/bounds-construction-with-ignore-param-bounds.ll @@ -19,7 +19,7 @@ ; Check that kernel launch is generated in host IR. ; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32) ; ModuleID = 'test/GPGPU/bounds-construction-with-ignore-param-bounds.ll' ; C pseudocode Index: test/GPGPU/cuda-managed-memory-simple.ll =================================================================== --- test/GPGPU/cuda-managed-memory-simple.ll +++ test/GPGPU/cuda-managed-memory-simple.ll @@ -46,8 +46,8 @@ ; CHECK-NEXT: store i8* %15, i8** %polly_launch_0_param_1 ; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8* ; CHECK-NEXT: store i8* %19, i8** %18 -; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([852 x i8], [852 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0)) -; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) +; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([852 x i8], [852 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i64 851, i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0)) +; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr, i32 0) ; CHECK-NEXT: call void @polly_freeKernel(i8* %20) ; CHECK-NEXT: call void @polly_synchronizeDevice() ; CHECK-NEXT: call void @polly_freeContext(i8* %13) Index: test/GPGPU/intrinsic-copied-into-kernel.ll =================================================================== --- test/GPGPU/intrinsic-copied-into-kernel.ll +++ test/GPGPU/intrinsic-copied-into-kernel.ll @@ -18,7 +18,7 @@ ; Check that kernel launch is generated in host IR. ; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32) ; void f(float *A, float *B, int N) { Index: test/GPGPU/invariant-load-array-access.ll =================================================================== --- test/GPGPU/invariant-load-array-access.ll +++ test/GPGPU/invariant-load-array-access.ll @@ -22,7 +22,7 @@ ; Check that kernel launch is generated in host IR. ; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32) ; This test makes sure that such an access pattern is handled correctly ; by PPCGCodeGeneration. It appears that not calling `preloadInvariantLoads` Index: test/GPGPU/invariant-load-hoisting-of-array.ll =================================================================== --- test/GPGPU/invariant-load-hoisting-of-array.ll +++ test/GPGPU/invariant-load-hoisting-of-array.ll @@ -54,7 +54,7 @@ ; Check that the kernel launch is generated in the host IR. ; This declaration would not have been generated unless a kernel launch exists. -; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32) ; C pseudocode equivalent Index: test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll =================================================================== --- test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll +++ test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll @@ -22,7 +22,7 @@ ; Check that the kernel launch is generated in the host IR. ; This declaration would not have been generated unless a kernel launch exists. -; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32) ; void f(int *begin, int *end, int *arr) { ; for (int i = *begin; i < *end; i++) { Index: test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll =================================================================== --- test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll +++ test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll @@ -19,7 +19,7 @@ ; Check that the kernel launch is generated in the host IR. ; This declaration would not have been generated unless a kernel launch exists. -; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32) ; ; void f(int *begin, int *arr) { Index: test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll =================================================================== --- test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll +++ test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll @@ -15,7 +15,7 @@ ; Check that kernel launch is generated in host IR. ; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32) ; Check if we generate GPU code for simple loop with variable upper bound. ; This always worked, but have this test to prevent regressions. Index: test/GPGPU/invariant-load-hoisting.ll =================================================================== --- test/GPGPU/invariant-load-hoisting.ll +++ test/GPGPU/invariant-load-hoisting.ll @@ -17,7 +17,7 @@ ; SCOP-NEXT: [n, tmp12] -> { Stmt_for_body6[i0, i1, i2] -> MemRef_invariant[0] }; ; SCOP-NEXT: Execution Context: [n, tmp12] -> { : n > 0 } ; SCOP-NEXT: } -; HOST-IR: call void @polly_launchKernel(i8* %209, i32 %215, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) +; HOST-IR: call void @polly_launchKernel(i8* %209, i32 %215, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr, i32 0) ; HOST-IR-NEXT: call void @polly_freeKernel(i8* %209) ; KERNEL-IR: define ptx_kernel void @FUNC_f_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B, i8 addrspace(1)* %MemRef_A, i32 %n, i32 %tmp12, i32 %polly.preload.tmp21.merge) Index: test/GPGPU/libdevice-functions-copied-into-kernel.ll =================================================================== --- test/GPGPU/libdevice-functions-copied-into-kernel.ll +++ test/GPGPU/libdevice-functions-copied-into-kernel.ll @@ -24,7 +24,7 @@ ; Check that kernel launch is generated in host IR. ; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32) ; void f(float *A, float *B, int N) { Index: test/GPGPU/privatization-simple.ll =================================================================== --- test/GPGPU/privatization-simple.ll +++ test/GPGPU/privatization-simple.ll @@ -9,7 +9,7 @@ ; Check that kernel launch is generated in host IR. ; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32) ; void f(int A[], int B[], int control, int C[]) { ; int x; Index: test/GPGPU/privatization.ll =================================================================== --- test/GPGPU/privatization.ll +++ test/GPGPU/privatization.ll @@ -10,7 +10,7 @@ ; Check that kernel launch is generated in host IR. ; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*, i32) ; ; Index: tools/GPURuntime/GPUJIT.h =================================================================== --- tools/GPURuntime/GPUJIT.h +++ tools/GPURuntime/GPUJIT.h @@ -50,6 +50,7 @@ * PollyGPUDevicePtr *DevArray; * int *HostData; * int MemSize; + * size_t KernelSize = strlen(KernelString); * * int GridX = 8; * int GridY = 8; @@ -61,7 +62,7 @@ * MemSize = 256*64*sizeof(int); * Context = polly_initContext(); * DevArray = polly_allocateMemoryForDevice(MemSize); - * Kernel = polly_getKernel(KernelString, KernelName); + * Kernel = polly_getKernel(KernelString, KernelSize, KernelName); * * void *Params[1]; * void *DevPtr = polly_getDevicePtr(DevArray) @@ -98,6 +99,7 @@ PollyGPUContext *polly_initContextCUDA(); PollyGPUContext *polly_initContextCL(); PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, + const size_t BinarySize, const char *KernelName); void polly_freeKernel(PollyGPUFunction *Kernel); void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, @@ -108,7 +110,7 @@ void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, unsigned int GridDimY, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ, - void **Parameters); + void **Parameters, unsigned int CLUseLocalWorkSize); void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation); void polly_freeContext(PollyGPUContext *Context); Index: tools/GPURuntime/GPUJIT.c =================================================================== --- tools/GPURuntime/GPUJIT.c +++ tools/GPURuntime/GPUJIT.c @@ -465,6 +465,7 @@ } static PollyGPUFunction *getKernelCL(const char *BinaryBuffer, + const size_t BinarySize, const char *KernelName) { dump_function(); @@ -523,7 +524,6 @@ checkOpenCLError(Ret, "Failed to create program from llvm.\n"); unlink("kernel.ll"); } else { - size_t BinarySize = strlen(BinaryBuffer); ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr( ((OpenCLContext *)GlobalContext->Context)->Context, 1, @@ -591,7 +591,7 @@ static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX, unsigned int GridDimY, unsigned int BlockDimX, unsigned int BlockDimY, unsigned int BlockDimZ, - void **Parameters) { + void **Parameters, unsigned int UseLocalWorkSize) { dump_function(); cl_int Ret; @@ -622,9 +622,14 @@ static const int WorkDim = 3; OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context; - Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel, - WorkDim, NULL, GlobalWorkSize, - LocalWorkSize, 0, NULL, NULL); + if (UseLocalWorkSize) + Ret = clEnqueueNDRangeKernelFcnPtr( + CLContext->CommandQueue, CLKernel->Kernel, WorkDim, NULL, + GlobalWorkSize, LocalWorkSize, 0, NULL, NULL); + else + Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, + CLKernel->Kernel, WorkDim, NULL, + GlobalWorkSize, NULL, 0, NULL, NULL); checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n"); } @@ -1619,6 +1624,7 @@ } PollyGPUFunction *polly_getKernel(const char *BinaryBuffer, + const size_t BinarySize, const char *KernelName) { dump_function(); @@ -1632,7 +1638,7 @@ #endif /* HAS_LIBCUDART */ #ifdef HAS_LIBOPENCL case RUNTIME_CL: - Function = getKernelCL(BinaryBuffer, KernelName); + Function = getKernelCL(BinaryBuffer, BinarySize, KernelName); break; #endif /* HAS_LIBOPENCL */ default: @@ -1685,7 +1691,7 @@ void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, unsigned int GridDimY, unsigned int BlockDimX, unsigned int BlockDimY, unsigned int BlockDimZ, - void **Parameters) { + void **Parameters, unsigned int CLUseLocalWorkSize) { dump_function(); switch (Runtime) { @@ -1698,7 +1704,7 @@ #ifdef HAS_LIBOPENCL case RUNTIME_CL: launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ, - Parameters); + Parameters, CLUseLocalWorkSize); break; #endif /* HAS_LIBOPENCL */ default: