Index: polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h
===================================================================
--- polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h
+++ polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h
@@ -16,7 +16,7 @@
 #define POLLY_PPCGCODEGENERATION_H
 
 /// The GPU Architecture to target.
-enum GPUArch { NVPTX64 };
+enum GPUArch { NVPTX64, SPIR32, SPIR64 };
 
 /// The GPU Runtime implementation to use.
 enum GPURuntime { CUDA, OpenCL };
Index: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -545,6 +545,11 @@
   /// @param The kernel to generate the intrinsic functions for.
   void insertKernelIntrinsics(ppcg_kernel *Kernel);
 
+  /// Insert function calls to retrieve the SPIR group/local ids.
+  ///
+  /// @param Kernel The kernel to generate the function calls for.
+  void insertKernelCallsSPIR(ppcg_kernel *Kernel);
+
   /// Setup the creation of functions referenced by the GPU kernel.
   ///
   /// 1. Create new function declarations in GPUModule which are the same as
@@ -1254,10 +1259,24 @@
 void GPUNodeBuilder::createKernelSync() {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  const char *SpirName = "__gen_ocl_barrier_global";
   Function *Sync;
 
   switch (Arch) {
+  case GPUArch::SPIR64:
+  case GPUArch::SPIR32:
+    Sync = M->getFunction(SpirName);
+
+    // If Sync is not available, declare it.
+    if (!Sync) {
+      GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+      std::vector<Type *> Args;
+      FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
+      Sync = Function::Create(Ty, Linkage, SpirName, M);
+      Sync->setCallingConv(CallingConv::SPIR_FUNC);
+    }
+    break;
   case GPUArch::NVPTX64:
     Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
     break;
@@ -1668,7 +1687,8 @@
   finalizeKernelArguments(Kernel);
   Function *F = Builder.GetInsertBlock()->getParent();
-  addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
+  if (Arch == GPUArch::NVPTX64)
+    addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
   clearDominators(F);
   clearScalarEvolution(F);
   clearLoops(F);
@@ -1725,12 +1745,35 @@
   return Ret;
 }
 
+/// Compute the DataLayout string for a SPIR kernel.
+///
+/// @param is64Bit Are we looking for a 64-bit architecture?
+static std::string computeSPIRDataLayout(bool is64Bit) {
+  std::string Ret = "";
+
+  if (!is64Bit) {
+    Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
+           "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
+           "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
+           "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
+  } else {
+    Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
+           "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
+           "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
+           "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
+  }
+
+  return Ret;
+}
+
 Function *
 GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
                                          SetVector<Value *> &SubtreeValues) {
   std::vector<Type *> Args;
   std::string Identifier = getKernelFuncName(Kernel->id);
 
+  std::vector<Metadata *> MemoryType;
+
   for (long i = 0; i < Prog->n_array; i++) {
     if (!ppcg_kernel_requires_array_argument(Kernel, i))
       continue;
@@ -1739,16 +1782,23 @@
       isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
       const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
       Args.push_back(SAI->getElementType());
+      MemoryType.push_back(
+          ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
     } else {
       static const int UseGlobalMemory = 1;
       Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory));
+      MemoryType.push_back(
+          ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1)));
     }
   }
 
   int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);
 
-  for (long i = 0; i < NumHostIters; i++)
+  for (long i = 0; i < NumHostIters; i++) {
     Args.push_back(Builder.getInt64Ty());
+    MemoryType.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
+  }
 
   int NumVars = isl_space_dim(Kernel->space, isl_dim_param);
@@ -1757,19 +1807,49 @@
     Value *Val = IDToValue[Id];
     isl_id_free(Id);
     Args.push_back(Val->getType());
+    MemoryType.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
   }
 
-  for (auto *V : SubtreeValues)
+  for (auto *V : SubtreeValues) {
     Args.push_back(V->getType());
+    MemoryType.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
+  }
 
   auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
   auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                               GPUModule.get());
 
+  std::vector<Metadata *> EmptyStrings;
+
+  for (unsigned int i = 0; i < MemoryType.size(); i++) {
+    EmptyStrings.push_back(MDString::get(FN->getContext(), ""));
+  }
+
+  if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) {
+    FN->setMetadata("kernel_arg_addr_space",
+                    MDNode::get(FN->getContext(), MemoryType));
+    FN->setMetadata("kernel_arg_name",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_access_qual",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_type",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_type_qual",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_base_type",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+  }
+
   switch (Arch) {
   case GPUArch::NVPTX64:
     FN->setCallingConv(CallingConv::PTX_Kernel);
     break;
+  case GPUArch::SPIR32:
+  case GPUArch::SPIR64:
+    FN->setCallingConv(CallingConv::SPIR_KERNEL);
+    break;
   }
 
   auto Arg = FN->arg_begin();
@@ -1835,6 +1915,9 @@
   Intrinsic::ID IntrinsicsTID[3];
 
   switch (Arch) {
+  case GPUArch::SPIR64:
+  case GPUArch::SPIR32:
+    llvm_unreachable("Cannot generate NVVM intrinsics for SPIR");
   case GPUArch::NVPTX64:
     IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
     IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
@@ -1866,6 +1949,41 @@
   }
 }
 
+void GPUNodeBuilder::insertKernelCallsSPIR(ppcg_kernel *Kernel) {
+  const char *GroupName[3] = {"__gen_ocl_get_group_id0",
+                              "__gen_ocl_get_group_id1",
+                              "__gen_ocl_get_group_id2"};
+
+  const char *LocalName[3] = {"__gen_ocl_get_local_id0",
+                              "__gen_ocl_get_local_id1",
+                              "__gen_ocl_get_local_id2"};
+
+  auto createFunc = [this](const char *Name, __isl_take isl_id *Id) mutable {
+    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+    Function *FN = M->getFunction(Name);
+
+    // If FN is not available, declare it.
+    if (!FN) {
+      GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+      std::vector<Type *> Args;
+      FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Args, false);
+      FN = Function::Create(Ty, Linkage, Name, M);
+      FN->setCallingConv(CallingConv::SPIR_FUNC);
+    }
+
+    Value *Val = Builder.CreateCall(FN, {});
+    Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
+    IDToValue[Id] = Val;
+    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
+  };
+
+  for (int i = 0; i < Kernel->n_grid; ++i)
+    createFunc(GroupName[i], isl_id_list_get_id(Kernel->block_ids, i));
+
+  for (int i = 0; i < Kernel->n_block; ++i)
+    createFunc(LocalName[i], isl_id_list_get_id(Kernel->thread_ids, i));
+}
+
 void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
   auto Arg = FN->arg_begin();
   for (long i = 0; i < Kernel->n_array; i++) {
@@ -2004,6 +2122,14 @@
     GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
     GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
     break;
+  case GPUArch::SPIR32:
+    GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown"));
+    GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */));
+    break;
+  case GPUArch::SPIR64:
+    GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown"));
+    GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */));
+    break;
   }
 
   Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
@@ -2021,7 +2147,16 @@
   prepareKernelArguments(Kernel, FN);
   createKernelVariables(Kernel, FN);
-  insertKernelIntrinsics(Kernel);
+
+  switch (Arch) {
+  case GPUArch::NVPTX64:
+    insertKernelIntrinsics(Kernel);
+    break;
+  case GPUArch::SPIR32:
+  case GPUArch::SPIR64:
+    insertKernelCallsSPIR(Kernel);
+    break;
+  }
 }
 
 std::string GPUNodeBuilder::createKernelASM() {
@@ -2038,6 +2173,13 @@
       break;
     }
     break;
+  case GPUArch::SPIR64:
+  case GPUArch::SPIR32:
+    std::string SPIRAssembly;
+    raw_string_ostream IROstream(SPIRAssembly);
+    IROstream << *GPUModule;
+    IROstream.flush();
+    return SPIRAssembly;
   }
 
   std::string ErrMsg;
@@ -2057,6 +2199,9 @@
   case GPUArch::NVPTX64:
     subtarget = CudaVersion;
     break;
+  case GPUArch::SPIR32:
+  case GPUArch::SPIR64:
+    llvm_unreachable("No subtarget for SPIR architecture");
   }
 
   std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
@@ -2097,13 +2242,15 @@
   if (DumpKernelIR)
     outs() << *GPUModule << "\n";
 
-  // Optimize module.
-  llvm::legacy::PassManager OptPasses;
-  PassManagerBuilder PassBuilder;
-  PassBuilder.OptLevel = 3;
-  PassBuilder.SizeLevel = 0;
-  PassBuilder.populateModulePassManager(OptPasses);
-  OptPasses.run(*GPUModule);
+  if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) {
+    // Optimize module.
+    llvm::legacy::PassManager OptPasses;
+    PassManagerBuilder PassBuilder;
+    PassBuilder.OptLevel = 3;
+    PassBuilder.SizeLevel = 0;
+    PassBuilder.populateModulePassManager(OptPasses);
+    OptPasses.run(*GPUModule);
+  }
 
   std::string Assembly = createKernelASM();
Index: polly/trunk/lib/Support/RegisterPasses.cpp
===================================================================
--- polly/trunk/lib/Support/RegisterPasses.cpp
+++ polly/trunk/lib/Support/RegisterPasses.cpp
@@ -117,7 +117,11 @@
 static cl::opt<GPUArch>
     GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
                   cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
-                                        "target NVIDIA 64-bit architecture")),
+                                        "target NVIDIA 64-bit architecture"),
+                             clEnumValN(GPUArch::SPIR32, "spir32",
+                                        "target SPIR 32-bit architecture"),
+                             clEnumValN(GPUArch::SPIR64, "spir64",
+                                        "target SPIR 64-bit architecture")),
                   cl::init(GPUArch::NVPTX64), cl::ZeroOrMore, cl::cat(PollyCategory));
 #endif
Index: polly/trunk/test/GPGPU/spir-codegen.ll
===================================================================
--- polly/trunk/test/GPGPU/spir-codegen.ll
+++ polly/trunk/test/GPGPU/spir-codegen.ll
@@ -0,0 +1,118 @@
+; RUN: opt -O3 -polly -polly-target=gpu \
+; RUN: -polly-gpu-arch=spir32 \
+; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \
+; RUN: FileCheck %s
+
+; REQUIRES: pollyacc
+
+; CHECK: target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+; CHECK-NEXT: target triple = "spir-unknown-unknown"
+
+; CHECK-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef0) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %0 = call i32 @__gen_ocl_get_group_id0()
+; CHECK-NEXT: %__gen_ocl_get_group_id0 = zext i32 %0 to i64
+; CHECK-NEXT: %1 = call i32 @__gen_ocl_get_group_id1()
+; CHECK-NEXT: %__gen_ocl_get_group_id1 = zext i32 %1 to i64
+; CHECK-NEXT: %2 = call i32 @__gen_ocl_get_local_id0()
+; CHECK-NEXT: %__gen_ocl_get_local_id0 = zext i32 %2 to i64
+; CHECK-NEXT: %3 = call i32 @__gen_ocl_get_local_id1()
+; CHECK-NEXT: %__gen_ocl_get_local_id1 = zext i32 %3 to i64
+; CHECK-NEXT: br label %polly.loop_preheader
+
+; CHECK-LABEL: polly.loop_exit: ; preds = %polly.stmt.bb5
+; CHECK-NEXT: ret void
+
+; CHECK-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader
+; CHECK-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ]
+; CHECK-NEXT: %4 = mul nsw i64 32, %__gen_ocl_get_group_id0
+; CHECK-NEXT: %5 = add nsw i64 %4, %__gen_ocl_get_local_id0
+; CHECK-NEXT: %6 = mul nsw i64 32, %__gen_ocl_get_group_id1
+; CHECK-NEXT: %7 = add nsw i64 %6, %__gen_ocl_get_local_id1
+; CHECK-NEXT: %8 = mul nsw i64 16, %polly.indvar
+; CHECK-NEXT: %9 = add nsw i64 %7, %8
+; CHECK-NEXT: br label %polly.stmt.bb5
+
+; CHECK-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header
+; CHECK-NEXT: %10 = mul i64 %5, %9
+; CHECK-NEXT: %p_tmp6 = sitofp i64 %10 to float
+; CHECK-NEXT: %polly.access.cast.MemRef0 = bitcast i8 addrspace(1)* %MemRef0 to float addrspace(1)*
+; CHECK-NEXT: %11 = mul nsw i64 32, %__gen_ocl_get_group_id0
+; CHECK-NEXT: %12 = add nsw i64 %11, %__gen_ocl_get_local_id0
+; CHECK-NEXT: %polly.access.mul.MemRef0 = mul nsw i64 %12, 1024
+; CHECK-NEXT: %13 = mul nsw i64 32, %__gen_ocl_get_group_id1
+; CHECK-NEXT: %14 = add nsw i64 %13, %__gen_ocl_get_local_id1
+; CHECK-NEXT: %15 = mul nsw i64 16, %polly.indvar
+; CHECK-NEXT: %16 = add nsw i64 %14, %15
+; CHECK-NEXT: %polly.access.add.MemRef0 = add nsw i64 %polly.access.mul.MemRef0, %16
+; CHECK-NEXT: %polly.access.MemRef0 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef0, i64 %polly.access.add.MemRef0
+; CHECK-NEXT: %tmp8_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef0, align 4
+; CHECK-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6
+; CHECK-NEXT: %polly.access.cast.MemRef01 = bitcast i8 addrspace(1)* %MemRef0 to float addrspace(1)*
+; CHECK-NEXT: %17 = mul nsw i64 32, %__gen_ocl_get_group_id0
+; CHECK-NEXT: %18 = add nsw i64 %17, %__gen_ocl_get_local_id0
+; CHECK-NEXT: %polly.access.mul.MemRef02 = mul nsw i64 %18, 1024
+; CHECK-NEXT: %19 = mul nsw i64 32, %__gen_ocl_get_group_id1
+; CHECK-NEXT: %20 = add nsw i64 %19, %__gen_ocl_get_local_id1
+; CHECK-NEXT: %21 = mul nsw i64 16, %polly.indvar
+; CHECK-NEXT: %22 = add nsw i64 %20, %21
+; CHECK-NEXT: %polly.access.add.MemRef03 = add nsw i64 %polly.access.mul.MemRef02, %22
+; CHECK-NEXT: %polly.access.MemRef04 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef01, i64 %polly.access.add.MemRef03
+; CHECK-NEXT: store float %p_tmp9, float addrspace(1)* %polly.access.MemRef04, align 4
+; CHECK-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
+; CHECK-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, 1
+; CHECK-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
+
+; CHECK-LABEL: polly.loop_preheader: ; preds = %entry
+; CHECK-NEXT: br label %polly.loop_header
+
+; CHECK: attributes #0 = { "polly.skip.fn" }
+
+; void double_parallel_loop(float A[][1024]) {
+;   for (long i = 0; i < 1024; i++)
+;     for (long j = 0; j < 1024; j++)
+;       A[i][j] += i * j;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @double_parallel_loop([1024 x float]* %A) {
+bb:
+  br label %bb2
+
+bb2: ; preds = %bb13, %bb
+  %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ]
+  %exitcond1 = icmp ne i64 %i.0, 1024
+  br i1 %exitcond1, label %bb3, label %bb15
+
+bb3: ; preds = %bb2
+  br label %bb4
+
+bb4: ; preds = %bb10, %bb3
+  %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ]
+  %exitcond = icmp ne i64 %j.0, 1024
+  br i1 %exitcond, label %bb5, label %bb12
+
+bb5: ; preds = %bb4
+  %tmp = mul nuw nsw i64 %i.0, %j.0
+  %tmp6 = sitofp i64 %tmp to float
+  %tmp7 = getelementptr inbounds [1024 x float], [1024 x float]* %A, i64 %i.0, i64 %j.0
+  %tmp8 = load float, float* %tmp7, align 4
+  %tmp9 = fadd float %tmp8, %tmp6
+  store float %tmp9, float* %tmp7, align 4
+  br label %bb10
+
+bb10: ; preds = %bb5
+  %tmp11 = add nuw nsw i64 %j.0, 1
+  br label %bb4
+
+bb12: ; preds = %bb4
+  br label %bb13
+
+bb13: ; preds = %bb12
+  %tmp14 = add nuw nsw i64 %i.0, 1
+  br label %bb2
+
+bb15: ; preds = %bb2
+  ret void
+}
Index: polly/trunk/tools/GPURuntime/GPUJIT.c
===================================================================
--- polly/trunk/tools/GPURuntime/GPUJIT.c
+++ polly/trunk/tools/GPURuntime/GPUJIT.c
@@ -23,13 +23,14 @@
 #include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
-#endif
+#endif /* __APPLE__ */
 #endif /* HAS_LIBOPENCL */
 #include
 #include
 #include
 #include
+#include <unistd.h>
 
 static int DebugMode;
 static int CacheMode;
@@ -89,6 +90,7 @@
 /* Dynamic library handles for the OpenCL runtime library. */
 static void *HandleOpenCL;
+static void *HandleOpenCLBeignet;
 
 /* Type-defines of function pointer to OpenCL Runtime API. */
 typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
@@ -139,6 +141,12 @@
                                          const cl_event *EventWaitList,
                                          cl_event *Event);
 static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
 
+typedef cl_program
+clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices,
+                                  const cl_device_id *DeviceList,
+                                  const char *Filename, cl_int *ErrcodeRet);
+static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr;
+
 typedef cl_program clCreateProgramWithBinaryFcnTy(
     cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
     const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
@@ -210,6 +218,7 @@
 }
 
 static int initialDeviceAPILibrariesCL() {
+  HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY);
   HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
   if (!HandleOpenCL) {
     fprintf(stderr, "Cannot open library: %s. \n", dlerror());
@@ -237,67 +246,79 @@
   if (initialDeviceAPILibrariesCL() == 0)
     return 0;
 
+  // FIXME: We are now always selecting the Intel Beignet driver if it is
+  // available on the system, instead of a possible NVIDIA or AMD OpenCL
+  // API. This selection should occur based on the target architecture
+  // chosen when compiling.
+  void *Handle =
+      (HandleOpenCLBeignet != NULL ? HandleOpenCLBeignet : HandleOpenCL);
+
   clGetPlatformIDsFcnPtr =
-      (clGetPlatformIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetPlatformIDs");
+      (clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs");
 
   clGetDeviceIDsFcnPtr =
-      (clGetDeviceIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceIDs");
+      (clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs");
 
   clGetDeviceInfoFcnPtr =
-      (clGetDeviceInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceInfo");
+      (clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo");
 
   clGetKernelInfoFcnPtr =
-      (clGetKernelInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetKernelInfo");
+      (clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo");
 
   clCreateContextFcnPtr =
-      (clCreateContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateContext");
+      (clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext");
 
   clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL(
-      HandleOpenCL, "clCreateCommandQueue");
+      Handle, "clCreateCommandQueue");
 
   clCreateBufferFcnPtr =
-      (clCreateBufferFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateBuffer");
+      (clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer");
 
   clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
-      HandleOpenCL, "clEnqueueWriteBuffer");
+      Handle, "clEnqueueWriteBuffer");
+
+  if (HandleOpenCLBeignet)
+    clCreateProgramWithLLVMIntelFcnPtr =
+        (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL(
+            Handle, "clCreateProgramWithLLVMIntel");
 
   clCreateProgramWithBinaryFcnPtr =
       (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
-          HandleOpenCL, "clCreateProgramWithBinary");
+          Handle, "clCreateProgramWithBinary");
 
   clBuildProgramFcnPtr =
-      (clBuildProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clBuildProgram");
+      (clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram");
 
   clCreateKernelFcnPtr =
-      (clCreateKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateKernel");
+      (clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel");
 
   clSetKernelArgFcnPtr =
-      (clSetKernelArgFcnTy *)getAPIHandleCL(HandleOpenCL, "clSetKernelArg");
+      (clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg");
 
   clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL(
-      HandleOpenCL, "clEnqueueNDRangeKernel");
+      Handle, "clEnqueueNDRangeKernel");
 
-  clEnqueueReadBufferFcnPtr = (clEnqueueReadBufferFcnTy *)getAPIHandleCL(
-      HandleOpenCL, "clEnqueueReadBuffer");
+  clEnqueueReadBufferFcnPtr =
+      (clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer");
 
-  clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(HandleOpenCL, "clFlush");
+  clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush");
 
-  clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(HandleOpenCL, "clFinish");
+  clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish");
 
   clReleaseKernelFcnPtr =
-      (clReleaseKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseKernel");
+      (clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel");
 
   clReleaseProgramFcnPtr =
-      (clReleaseProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseProgram");
+      (clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram");
 
-  clReleaseMemObjectFcnPtr = (clReleaseMemObjectFcnTy *)getAPIHandleCL(
-      HandleOpenCL, "clReleaseMemObject");
+  clReleaseMemObjectFcnPtr =
+      (clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject");
 
   clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL(
-      HandleOpenCL, "clReleaseCommandQueue");
+      Handle, "clReleaseCommandQueue");
 
   clReleaseContextFcnPtr =
-      (clReleaseContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseContext");
+      (clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext");
 
   return 1;
 }
@@ -481,12 +502,32 @@
   }
 
   cl_int Ret;
-  size_t BinarySize = strlen(BinaryBuffer);
-  ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr(
-      ((OpenCLContext *)GlobalContext->Context)->Context, 1, &GlobalDeviceID,
-      (const size_t *)&BinarySize, (const unsigned char **)&BinaryBuffer, NULL,
-      &Ret);
-  checkOpenCLError(Ret, "Failed to create program from binary.\n");
+
+  if (HandleOpenCLBeignet) {
+    // TODO: This is a workaround, since clCreateProgramWithLLVMIntel only
+    // accepts the name of a file containing valid LLVM IR as an argument,
+    // instead of accepting the BinaryBuffer directly.
+    FILE *fp = fopen("kernel.ll", "wb");
+    if (fp != NULL) {
+      fputs(BinaryBuffer, fp);
+      fclose(fp);
+    }
+
+    ((OpenCLKernel *)Function->Kernel)->Program =
+        clCreateProgramWithLLVMIntelFcnPtr(
+            ((OpenCLContext *)GlobalContext->Context)->Context, 1,
+            &GlobalDeviceID, "kernel.ll", &Ret);
+    checkOpenCLError(Ret, "Failed to create program from llvm.\n");
+    unlink("kernel.ll");
+  } else {
+    size_t BinarySize = strlen(BinaryBuffer);
+    ((OpenCLKernel *)Function->Kernel)->Program =
+        clCreateProgramWithBinaryFcnPtr(
+            ((OpenCLContext *)GlobalContext->Context)->Context, 1,
+            &GlobalDeviceID, (const size_t *)&BinarySize,
+            (const unsigned char **)&BinaryBuffer, NULL, &Ret);
+    checkOpenCLError(Ret, "Failed to create program from binary.\n");
+  }
 
   Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
                              &GlobalDeviceID, NULL, NULL, NULL);
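
Note (not part of the patch): the essential pattern emitted by createKernelFunctionDecl above, the SPIR_KERNEL calling convention plus the kernel_arg_* metadata nodes that OpenCL drivers such as Beignet expect on every kernel, can be reproduced in isolation with plain LLVM APIs. The standalone C++ sketch below is illustrative only; declareSPIRKernel and its single global-memory argument are assumptions made for the example.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Declare a SPIR kernel with one i8 addrspace(1)* argument (global memory),
// mark it with the SPIR_KERNEL calling convention, and attach the
// kernel_arg_* metadata, mirroring what the patch emits for Polly kernels.
static Function *declareSPIRKernel(Module &M, StringRef Name) {
  LLVMContext &Ctx = M.getContext();
  IRBuilder<> Builder(Ctx);

  Type *GlobalPtrTy = Builder.getInt8PtrTy(/*AddrSpace=*/1);
  FunctionType *FT = FunctionType::get(Builder.getVoidTy(), {GlobalPtrTy},
                                       /*isVarArg=*/false);
  Function *FN = Function::Create(FT, Function::ExternalLinkage, Name, &M);
  FN->setCallingConv(CallingConv::SPIR_KERNEL);

  // kernel_arg_addr_space: 1 == global memory; the remaining kernel_arg_*
  // nodes carry empty strings, exactly as in createKernelFunctionDecl.
  Metadata *AddrSpace =
      ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1));
  Metadata *Empty = MDString::get(Ctx, "");
  FN->setMetadata("kernel_arg_addr_space", MDNode::get(Ctx, {AddrSpace}));
  FN->setMetadata("kernel_arg_name", MDNode::get(Ctx, {Empty}));
  FN->setMetadata("kernel_arg_access_qual", MDNode::get(Ctx, {Empty}));
  FN->setMetadata("kernel_arg_type", MDNode::get(Ctx, {Empty}));
  FN->setMetadata("kernel_arg_type_qual", MDNode::get(Ctx, {Empty}));
  FN->setMetadata("kernel_arg_base_type", MDNode::get(Ctx, {Empty}));
  return FN;
}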