Index: CMakeLists.txt
===================================================================
--- CMakeLists.txt
+++ CMakeLists.txt
@@ -172,6 +172,11 @@
   INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} )
 endif(OpenCL_FOUND)
 
+option(USE_INTEL_OCL "Uses the Intel Beignet driver for GPGPU code" OFF)
+if (USE_INTEL_OCL)
+  add_definitions(-DHAS_INTEL_OCL)
+endif(USE_INTEL_OCL)
+
 option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON)
 if (NOT POLLY_BUNDLED_ISL)
   find_package(ISL MODULE REQUIRED)
Index: include/polly/CodeGen/PPCGCodeGeneration.h
===================================================================
--- include/polly/CodeGen/PPCGCodeGeneration.h
+++ include/polly/CodeGen/PPCGCodeGeneration.h
@@ -16,7 +16,7 @@
 #define POLLY_PPCGCODEGENERATION_H
 
 /// The GPU Architecture to target.
-enum GPUArch { NVPTX64 };
+enum GPUArch { NVPTX64, SPIR32, SPIR64 };
 
 /// The GPU Runtime implementation to use.
 enum GPURuntime { CUDA, OpenCL };
Index: lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- lib/CodeGen/PPCGCodeGeneration.cpp
+++ lib/CodeGen/PPCGCodeGeneration.cpp
@@ -37,6 +37,8 @@
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
+#include <regex>
+
 #include "isl/union_map.h"
 
 extern "C" {
@@ -570,6 +572,11 @@
   /// @returns A string containing the corresponding PTX assembly code.
   std::string createKernelASM();
 
+  /// Create a SPIR string for the current GPU kernel.
+  ///
+  /// @returns A string containing the corresponding SPIR code.
+  std::string createKernelSPIR(std::string IR);
+
   /// Remove references from the dominator tree to the kernel function @p F.
   ///
   /// @param F The function to remove references to.
@@ -1230,10 +1237,24 @@
 void GPUNodeBuilder::createKernelSync() {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  const char *SpirName = "__gen_ocl_barrier_global";
 
   Function *Sync;
 
   switch (Arch) {
+  case GPUArch::SPIR64:
+  case GPUArch::SPIR32:
+    Sync = M->getFunction(SpirName);
+
+    // If Sync is not available, declare it.
+    if (!Sync) {
+      GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+      std::vector<Type *> Args;
+      FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
+      Sync = Function::Create(Ty, Linkage, SpirName, M);
+      Sync->setCallingConv(CallingConv::SPIR_FUNC);
+    }
+    break;
   case GPUArch::NVPTX64:
     Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
     break;
   }
@@ -1629,7 +1650,8 @@
   finalizeKernelArguments(Kernel);
   Function *F = Builder.GetInsertBlock()->getParent();
-  addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
+  if (Arch == GPUArch::NVPTX64)
+    addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
   clearDominators(F);
   clearScalarEvolution(F);
   clearLoops(F);
 
@@ -1686,12 +1708,35 @@
   return Ret;
 }
 
+/// Compute the DataLayout string for a SPIR kernel.
+///
+/// @param is64Bit Are we looking for a 64 bit architecture?
+static std::string computeSPIRDataLayout(bool is64Bit) {
+  std::string Ret = "";
+
+  if (!is64Bit) {
+    Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
+           "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
+           "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
+           "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
+  } else {
+    Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
+           "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
+           "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
+           "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
+  }
+
+  return Ret;
+}
+
 Function *
 GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
                                          SetVector<Value *> &SubtreeValues) {
   std::vector<Type *> Args;
   std::string Identifier = getKernelFuncName(Kernel->id);
 
+  std::vector<Metadata *> MemoryType;
+
   for (long i = 0; i < Prog->n_array; i++) {
     if (!ppcg_kernel_requires_array_argument(Kernel, i))
       continue;
 
@@ -1700,16 +1745,23 @@
       isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
       const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
       Args.push_back(SAI->getElementType());
+      MemoryType.push_back(
+          ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
     } else {
       static const int UseGlobalMemory = 1;
       Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory));
+      MemoryType.push_back(
+          ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1)));
     }
   }
 
   int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);
 
-  for (long i = 0; i < NumHostIters; i++)
+  for (long i = 0; i < NumHostIters; i++) {
     Args.push_back(Builder.getInt64Ty());
+    MemoryType.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
+  }
 
   int NumVars = isl_space_dim(Kernel->space, isl_dim_param);
 
@@ -1718,19 +1770,49 @@
     Value *Val = IDToValue[Id];
     isl_id_free(Id);
     Args.push_back(Val->getType());
+    MemoryType.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
   }
 
-  for (auto *V : SubtreeValues)
+  for (auto *V : SubtreeValues) {
     Args.push_back(V->getType());
+    MemoryType.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
+  }
 
   auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
   auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                               GPUModule.get());
 
+  std::vector<Metadata *> EmptyStrings;
+
+  for (unsigned int i = 0; i < MemoryType.size(); i++) {
+    EmptyStrings.push_back(MDString::get(FN->getContext(), ""));
+  }
+
+  if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) {
+    FN->setMetadata("kernel_arg_addr_space",
+                    MDNode::get(FN->getContext(), MemoryType));
+    FN->setMetadata("kernel_arg_name",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_access_qual",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_type",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_type_qual",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_base_type",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+  }
+
   switch (Arch) {
   case GPUArch::NVPTX64:
     FN->setCallingConv(CallingConv::PTX_Kernel);
     break;
+  case GPUArch::SPIR32:
+  case GPUArch::SPIR64:
+    FN->setCallingConv(CallingConv::SPIR_KERNEL);
+    break;
   }
 
   auto Arg = FN->arg_begin();
@@ -1796,6 +1878,8 @@
   Intrinsic::ID IntrinsicsTID[3];
 
   switch (Arch) {
+  case GPUArch::SPIR64:
+  case GPUArch::SPIR32:
   case GPUArch::NVPTX64:
     IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
     IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
@@ -1965,6 +2049,14 @@
       GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
     GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
     break;
+  case GPUArch::SPIR32:
+    GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown"));
+    GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */));
+    break;
+  case GPUArch::SPIR64:
+    GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown"));
+    GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */));
+    break;
   }
 
   Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
@@ -1999,6 +2091,10 @@
       break;
     }
     break;
+  case GPUArch::SPIR64:
+  case GPUArch::SPIR32:
+    llvm_unreachable("Cannot generate ASM for SPIR architecture");
+    break;
   }
 
   std::string ErrMsg;
@@ -2018,6 +2114,10 @@
   case GPUArch::NVPTX64:
     subtarget = CudaVersion;
     break;
+  case GPUArch::SPIR32:
+  case GPUArch::SPIR64:
+    llvm_unreachable("No subtarget for SPIR architecture");
+    break;
   }
 
   std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
@@ -2040,6 +2140,43 @@
   return ASMStream.str();
 }
 
+std::string StringReplace(std::string const &in, std::string const &replace,
+                          std::string const &with) {
+  return std::regex_replace(in, std::regex(replace), with);
+}
+
+std::string GPUNodeBuilder::createKernelSPIR(std::string IR) {
+  IR = StringReplace(IR, "declare i32 @llvm.nvvm.read.ptx.sreg.tid.x\\(\\)",
+                     "declare spir_func i32 @__gen_ocl_get_local_id0()");
+  IR = StringReplace(IR, "declare i32 @llvm.nvvm.read.ptx.sreg.tid.y\\(\\)",
+                     "declare spir_func i32 @__gen_ocl_get_local_id1()");
+  IR = StringReplace(IR, "declare i32 @llvm.nvvm.read.ptx.sreg.tid.z\\(\\)",
+                     "declare spir_func i32 @__gen_ocl_get_local_id2()");
+
+  IR = StringReplace(IR, "declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x\\(\\)",
+                     "declare spir_func i32 @__gen_ocl_get_group_id0()");
+  IR = StringReplace(IR, "declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y\\(\\)",
+                     "declare spir_func i32 @__gen_ocl_get_group_id1()");
+  IR = StringReplace(IR, "declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z\\(\\)",
+                     "declare spir_func i32 @__gen_ocl_get_group_id2()");
+
+  IR = StringReplace(IR, "call i32 @llvm.nvvm.read.ptx.sreg.tid.x\\(\\)",
+                     "call spir_func i32 @__gen_ocl_get_local_id0()");
+  IR = StringReplace(IR, "call i32 @llvm.nvvm.read.ptx.sreg.tid.y\\(\\)",
+                     "call spir_func i32 @__gen_ocl_get_local_id1()");
+  IR = StringReplace(IR, "call i32 @llvm.nvvm.read.ptx.sreg.tid.z\\(\\)",
+                     "call spir_func i32 @__gen_ocl_get_local_id2()");
+
+  IR = StringReplace(IR, "call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x\\(\\)",
+                     "call spir_func i32 @__gen_ocl_get_group_id0()");
+  IR = StringReplace(IR, "call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y\\(\\)",
+                     "call spir_func i32 @__gen_ocl_get_group_id1()");
+  IR = StringReplace(IR, "call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z\\(\\)",
+                     "call spir_func i32 @__gen_ocl_get_group_id2()");
+
+  return IR;
+}
+
 std::string GPUNodeBuilder::finalizeKernelFunction() {
 
   if (verifyModule(*GPUModule)) {
@@ -2056,15 +2193,27 @@
   if (DumpKernelIR)
     outs() << *GPUModule << "\n";
 
-  // Optimize module.
-  llvm::legacy::PassManager OptPasses;
-  PassManagerBuilder PassBuilder;
-  PassBuilder.OptLevel = 3;
-  PassBuilder.SizeLevel = 0;
-  PassBuilder.populateModulePassManager(OptPasses);
-  OptPasses.run(*GPUModule);
+  if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) {
+    // Optimize module.
+    llvm::legacy::PassManager OptPasses;
+    PassManagerBuilder PassBuilder;
+    PassBuilder.OptLevel = 3;
+    PassBuilder.SizeLevel = 0;
+    PassBuilder.populateModulePassManager(OptPasses);
+    OptPasses.run(*GPUModule);
+  }
+
+  std::string Assembly;
 
-  std::string Assembly = createKernelASM();
+  if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) {
+    std::string IR;
+    raw_string_ostream IROstream(IR);
+    IROstream << *GPUModule;
+    IROstream.flush();
+    Assembly = createKernelSPIR(IR);
+  } else {
+    Assembly = createKernelASM();
+  }
 
   if (DumpKernelASM)
     outs() << Assembly << "\n";
Index: lib/Support/RegisterPasses.cpp
===================================================================
--- lib/Support/RegisterPasses.cpp
+++ lib/Support/RegisterPasses.cpp
@@ -117,7 +117,11 @@
 static cl::opt<GPUArch>
     GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
                   cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
-                                        "target NVIDIA 64-bit architecture")),
+                                        "target NVIDIA 64-bit architecture"),
+                             clEnumValN(GPUArch::SPIR32, "spir32",
+                                        "target SPIR 32-bit architecture"),
+                             clEnumValN(GPUArch::SPIR64, "spir64",
+                                        "target SPIR 64-bit architecture")),
                   cl::init(GPUArch::NVPTX64), cl::ZeroOrMore,
                   cl::cat(PollyCategory));
 #endif
Index: tools/GPURuntime/GPUJIT.c
===================================================================
--- tools/GPURuntime/GPUJIT.c
+++ tools/GPURuntime/GPUJIT.c
@@ -22,14 +22,19 @@
 #ifdef __APPLE__
 #include <OpenCL/opencl.h>
 #else
+#ifdef HAS_INTEL_OCL
+#include <CL/cl_intel.h>
+#else
 #include <CL/opencl.h>
-#endif
+#endif /* HAS_INTEL_OCL */
+#endif /* __APPLE__ */
 #endif /* HAS_LIBOPENCL */
 
 #include <dlfcn.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <unistd.h>
 
 static int DebugMode;
 static int CacheMode;
@@ -139,6 +144,12 @@
     const cl_event *EventWaitList, cl_event *Event);
 static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
 
+typedef cl_program
+clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices,
+                                  const cl_device_id *DeviceList,
+                                  const char *Filename, cl_int *ErrcodeRet);
+static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr;
+
 typedef cl_program clCreateProgramWithBinaryFcnTy(
     cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
     const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
@@ -210,7 +221,11 @@
 }
 
 static int initialDeviceAPILibrariesCL() {
+#ifdef HAS_INTEL_OCL
+  HandleOpenCL = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY);
+#else
   HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
+#endif /* HAS_INTEL_OCL */
   if (!HandleOpenCL) {
     fprintf(stderr, "Cannot open library: %s. \n", dlerror());
     return 0;
   }
@@ -261,6 +276,10 @@
   clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
       HandleOpenCL, "clEnqueueWriteBuffer");
 
+  clCreateProgramWithLLVMIntelFcnPtr =
+      (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL(
+          HandleOpenCL, "clCreateProgramWithLLVMIntel");
+
   clCreateProgramWithBinaryFcnPtr =
       (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
           HandleOpenCL, "clCreateProgramWithBinary");
@@ -481,12 +500,28 @@
   }
 
   cl_int Ret;
+
+#ifdef HAS_INTEL_OCL
+  FILE *fp = fopen("kernel.ll", "wb");
+  if (fp != NULL) {
+    fputs(BinaryBuffer, fp);
+    fclose(fp);
+  }
+
+  ((OpenCLKernel *)Function->Kernel)->Program =
+      clCreateProgramWithLLVMIntelFcnPtr(
+          ((OpenCLContext *)GlobalContext->Context)->Context, 1,
+          &GlobalDeviceID, "kernel.ll", &Ret);
+  checkOpenCLError(Ret, "Failed to create program from llvm.\n");
+  unlink("kernel.ll");
+#else
   size_t BinarySize = strlen(BinaryBuffer);
   ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr(
       ((OpenCLContext *)GlobalContext->Context)->Context, 1, &GlobalDeviceID,
       (const size_t *)&BinarySize, (const unsigned char **)&BinaryBuffer, NULL,
       &Ret);
   checkOpenCLError(Ret, "Failed to create program from binary.\n");
+#endif /* HAS_INTEL_OCL */
 
   Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
                              &GlobalDeviceID, NULL, NULL, NULL);
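
Note on createKernelSPIR() above: the snippet below is a minimal, standalone C++ sketch of the same std::regex-based rewrite, applied to a small sample IR fragment. The sample string and the main() driver are illustrative assumptions; only the patterns and replacements are taken from the patch.

#include <iostream>
#include <regex>
#include <string>

// Same helper shape as in the patch: replace every match of a regex pattern.
static std::string StringReplace(std::string const &In,
                                 std::string const &Pattern,
                                 std::string const &With) {
  return std::regex_replace(In, std::regex(Pattern), With);
}

int main() {
  // Illustrative kernel IR fragment, as the NVPTX-flavored path prints it.
  std::string IR = "  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()\n"
                   "declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()\n";

  // Map the NVVM index intrinsic to Beignet's SPIR builtin, declaration first,
  // then the call sites, mirroring the order used in createKernelSPIR().
  IR = StringReplace(IR, "declare i32 @llvm.nvvm.read.ptx.sreg.tid.x\\(\\)",
                     "declare spir_func i32 @__gen_ocl_get_local_id0()");
  IR = StringReplace(IR, "call i32 @llvm.nvvm.read.ptx.sreg.tid.x\\(\\)",
                     "call spir_func i32 @__gen_ocl_get_local_id0()");

  std::cout << IR;
  return 0;
}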