Index: include/polly/CodeGen/PPCGCodeGeneration.h =================================================================== --- include/polly/CodeGen/PPCGCodeGeneration.h +++ include/polly/CodeGen/PPCGCodeGeneration.h @@ -16,7 +16,7 @@ #define POLLY_PPCGCODEGENERATION_H /// The GPU Architecture to target. -enum GPUArch { NVPTX64 }; +enum GPUArch { NVPTX64, SPIR32, SPIR64 }; /// The GPU Runtime implementation to use. enum GPURuntime { CUDA, OpenCL }; Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -532,6 +532,11 @@ /// @param The kernel to generate the intrinsic functions for. void insertKernelIntrinsics(ppcg_kernel *Kernel); + /// Insert function calls to retrieve the SPIR group/local ids. + /// + /// @param The kernel to generate the function calls for. + void insertIDCallsSPIR(ppcg_kernel *Kernel); + /// Setup the creation of functions referenced by the GPU kernel. /// /// 1. Create new function declarations in GPUModule which are the same as @@ -1230,10 +1235,24 @@ void GPUNodeBuilder::createKernelSync() { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + const char *SpirName = "__gen_ocl_barrier_global"; Function *Sync; switch (Arch) { + case GPUArch::SPIR64: + case GPUArch::SPIR32: + Sync = M->getFunction(SpirName); + + // If Sync is not available, declare it. + if (!Sync) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector Args; + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + Sync = Function::Create(Ty, Linkage, SpirName, M); + Sync->setCallingConv(CallingConv::SPIR_FUNC); + } + break; case GPUArch::NVPTX64: Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); break; @@ -1629,7 +1648,8 @@ finalizeKernelArguments(Kernel); Function *F = Builder.GetInsertBlock()->getParent(); - addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ); + if (Arch == GPUArch::NVPTX64) + addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ); clearDominators(F); clearScalarEvolution(F); clearLoops(F); @@ -1686,12 +1706,35 @@ return Ret; } +/// Compute the DataLayout string for a SPIR kernel. +/// +/// @param is64Bit Are we looking for a 64 bit architecture? +static std::string computeSPIRDataLayout(bool is64Bit) { + std::string Ret = ""; + + if (!is64Bit) { + Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" + "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:" + "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:" + "256:256-v256:256:256-v512:512:512-v1024:1024:1024"; + } else { + Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" + "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:" + "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:" + "256:256-v256:256:256-v512:512:512-v1024:1024:1024"; + } + + return Ret; +} + Function * GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, SetVector &SubtreeValues) { std::vector Args; std::string Identifier = getKernelFuncName(Kernel->id); + std::vector MemoryType; + for (long i = 0; i < Prog->n_array; i++) { if (!ppcg_kernel_requires_array_argument(Kernel, i)) continue; @@ -1700,16 +1743,23 @@ isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); Args.push_back(SAI->getElementType()); + MemoryType.push_back( + ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); } else { static const int UseGlobalMemory = 1; Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory)); + MemoryType.push_back( + ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1))); } } int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); - for (long i = 0; i < NumHostIters; i++) + for (long i = 0; i < NumHostIters; i++) { Args.push_back(Builder.getInt64Ty()); + MemoryType.push_back( + ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); + } int NumVars = isl_space_dim(Kernel->space, isl_dim_param); @@ -1718,19 +1768,49 @@ Value *Val = IDToValue[Id]; isl_id_free(Id); Args.push_back(Val->getType()); + MemoryType.push_back( + ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); } - for (auto *V : SubtreeValues) + for (auto *V : SubtreeValues) { Args.push_back(V->getType()); + MemoryType.push_back( + ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); + } auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, GPUModule.get()); + std::vector EmptyStrings; + + for (unsigned int i = 0; i < MemoryType.size(); i++) { + EmptyStrings.push_back(MDString::get(FN->getContext(), "")); + } + + if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) { + FN->setMetadata("kernel_arg_addr_space", + MDNode::get(FN->getContext(), MemoryType)); + FN->setMetadata("kernel_arg_name", + MDNode::get(FN->getContext(), EmptyStrings)); + FN->setMetadata("kernel_arg_access_qual", + MDNode::get(FN->getContext(), EmptyStrings)); + FN->setMetadata("kernel_arg_type", + MDNode::get(FN->getContext(), EmptyStrings)); + FN->setMetadata("kernel_arg_type_qual", + MDNode::get(FN->getContext(), EmptyStrings)); + FN->setMetadata("kernel_arg_base_type", + MDNode::get(FN->getContext(), EmptyStrings)); + } + switch (Arch) { case GPUArch::NVPTX64: FN->setCallingConv(CallingConv::PTX_Kernel); break; + case GPUArch::SPIR32: + case GPUArch::SPIR64: + FN->setCallingConv(CallingConv::SPIR_KERNEL); + break; } auto Arg = FN->arg_begin(); @@ -1796,6 +1876,9 @@ Intrinsic::ID IntrinsicsTID[3]; switch (Arch) { + case GPUArch::SPIR64: + case GPUArch::SPIR32: + llvm_unreachable("Cannot generate NVVM intrinsics for SPIR"); case GPUArch::NVPTX64: IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x; IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y; @@ -1827,6 +1910,41 @@ } } +void GPUNodeBuilder::insertIDCallsSPIR(ppcg_kernel *Kernel) { + const char *GroupName[3] = {"__gen_ocl_get_group_id0", + "__gen_ocl_get_group_id1", + "__gen_ocl_get_group_id2"}; + + const char *LocalName[3] = {"__gen_ocl_get_local_id0", + "__gen_ocl_get_local_id1", + "__gen_ocl_get_local_id2"}; + + auto createFunc = [this](const char *Name, __isl_take isl_id *Id) mutable { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Function *FN = M->getFunction(Name); + + // If FN is not available, declare it. + if (!FN) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector Args; + FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Args, false); + FN = Function::Create(Ty, Linkage, Name, M); + FN->setCallingConv(CallingConv::SPIR_FUNC); + } + + Value *Val = Builder.CreateCall(FN, {}); + Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); + IDToValue[Id] = Val; + KernelIDs.insert(std::unique_ptr(Id)); + }; + + for (int i = 0; i < Kernel->n_grid; ++i) + createFunc(GroupName[i], isl_id_list_get_id(Kernel->block_ids, i)); + + for (int i = 0; i < Kernel->n_block; ++i) + createFunc(LocalName[i], isl_id_list_get_id(Kernel->thread_ids, i)); +} + void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) { auto Arg = FN->arg_begin(); for (long i = 0; i < Kernel->n_array; i++) { @@ -1965,6 +2083,14 @@ GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl")); GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); break; + case GPUArch::SPIR32: + GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown")); + GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */)); + break; + case GPUArch::SPIR64: + GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown")); + GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */)); + break; } Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); @@ -1982,7 +2108,16 @@ prepareKernelArguments(Kernel, FN); createKernelVariables(Kernel, FN); - insertKernelIntrinsics(Kernel); + + switch (Arch) { + case GPUArch::NVPTX64: + insertKernelIntrinsics(Kernel); + break; + case GPUArch::SPIR32: + case GPUArch::SPIR64: + insertIDCallsSPIR(Kernel); + break; + } } std::string GPUNodeBuilder::createKernelASM() { @@ -1999,6 +2134,10 @@ break; } break; + case GPUArch::SPIR64: + case GPUArch::SPIR32: + llvm_unreachable("Cannot generate ASM for SPIR architecture"); + break; } std::string ErrMsg; @@ -2018,6 +2157,10 @@ case GPUArch::NVPTX64: subtarget = CudaVersion; break; + case GPUArch::SPIR32: + case GPUArch::SPIR64: + llvm_unreachable("No subtarget for SPIR architecture"); + break; } std::unique_ptr TargetM(GPUTarget->createTargetMachine( @@ -2056,15 +2199,25 @@ if (DumpKernelIR) outs() << *GPUModule << "\n"; - // Optimize module. - llvm::legacy::PassManager OptPasses; - PassManagerBuilder PassBuilder; - PassBuilder.OptLevel = 3; - PassBuilder.SizeLevel = 0; - PassBuilder.populateModulePassManager(OptPasses); - OptPasses.run(*GPUModule); + if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) { + // Optimize module. + llvm::legacy::PassManager OptPasses; + PassManagerBuilder PassBuilder; + PassBuilder.OptLevel = 3; + PassBuilder.SizeLevel = 0; + PassBuilder.populateModulePassManager(OptPasses); + OptPasses.run(*GPUModule); + } + + std::string Assembly; - std::string Assembly = createKernelASM(); + if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) { + raw_string_ostream IROstream(Assembly); + IROstream << *GPUModule; + IROstream.flush(); + } else { + Assembly = createKernelASM(); + } if (DumpKernelASM) outs() << Assembly << "\n"; Index: lib/Support/RegisterPasses.cpp =================================================================== --- lib/Support/RegisterPasses.cpp +++ lib/Support/RegisterPasses.cpp @@ -117,7 +117,11 @@ static cl::opt GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"), cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64", - "target NVIDIA 64-bit architecture")), + "target NVIDIA 64-bit architecture"), + clEnumValN(GPUArch::SPIR32, "spir32", + "target SPIR 32-bit architecture"), + clEnumValN(GPUArch::SPIR64, "spir64", + "target SPIR 64-bit architecture")), cl::init(GPUArch::NVPTX64), cl::ZeroOrMore, cl::cat(PollyCategory)); #endif Index: tools/GPURuntime/GPUJIT.c =================================================================== --- tools/GPURuntime/GPUJIT.c +++ tools/GPURuntime/GPUJIT.c @@ -23,13 +23,14 @@ #include #else #include -#endif +#endif /* __APPLE__ */ #endif /* HAS_LIBOPENCL */ #include #include #include #include +#include static int DebugMode; static int CacheMode; @@ -89,6 +90,7 @@ /* Dynamic library handles for the OpenCL runtime library. */ static void *HandleOpenCL; +static void *HandleOpenCLBeignet; /* Type-defines of function pointer to OpenCL Runtime API. */ typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries, @@ -139,6 +141,12 @@ const cl_event *EventWaitList, cl_event *Event); static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr; +typedef cl_program +clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices, + const cl_device_id *DeviceList, + const char *Filename, cl_int *ErrcodeRet); +static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr; + typedef cl_program clCreateProgramWithBinaryFcnTy( cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList, const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus, @@ -210,6 +218,7 @@ } static int initialDeviceAPILibrariesCL() { + HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY); HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY); if (!HandleOpenCL) { fprintf(stderr, "Cannot open library: %s. \n", dlerror()); @@ -237,67 +246,77 @@ if (initialDeviceAPILibrariesCL() == 0) return 0; + void *Handle = HandleOpenCL; + + if (HandleOpenCLBeignet) + Handle = HandleOpenCLBeignet; + clGetPlatformIDsFcnPtr = - (clGetPlatformIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetPlatformIDs"); + (clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs"); clGetDeviceIDsFcnPtr = - (clGetDeviceIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceIDs"); + (clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs"); clGetDeviceInfoFcnPtr = - (clGetDeviceInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceInfo"); + (clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo"); clGetKernelInfoFcnPtr = - (clGetKernelInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetKernelInfo"); + (clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo"); clCreateContextFcnPtr = - (clCreateContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateContext"); + (clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext"); clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL( - HandleOpenCL, "clCreateCommandQueue"); + Handle, "clCreateCommandQueue"); clCreateBufferFcnPtr = - (clCreateBufferFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateBuffer"); + (clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer"); clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL( - HandleOpenCL, "clEnqueueWriteBuffer"); + Handle, "clEnqueueWriteBuffer"); + + if (HandleOpenCLBeignet) + clCreateProgramWithLLVMIntelFcnPtr = + (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL( + Handle, "clCreateProgramWithLLVMIntel"); clCreateProgramWithBinaryFcnPtr = (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL( - HandleOpenCL, "clCreateProgramWithBinary"); + Handle, "clCreateProgramWithBinary"); clBuildProgramFcnPtr = - (clBuildProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clBuildProgram"); + (clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram"); clCreateKernelFcnPtr = - (clCreateKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateKernel"); + (clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel"); clSetKernelArgFcnPtr = - (clSetKernelArgFcnTy *)getAPIHandleCL(HandleOpenCL, "clSetKernelArg"); + (clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg"); clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL( - HandleOpenCL, "clEnqueueNDRangeKernel"); + Handle, "clEnqueueNDRangeKernel"); - clEnqueueReadBufferFcnPtr = (clEnqueueReadBufferFcnTy *)getAPIHandleCL( - HandleOpenCL, "clEnqueueReadBuffer"); + clEnqueueReadBufferFcnPtr = + (clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer"); - clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(HandleOpenCL, "clFlush"); + clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush"); - clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(HandleOpenCL, "clFinish"); + clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish"); clReleaseKernelFcnPtr = - (clReleaseKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseKernel"); + (clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel"); clReleaseProgramFcnPtr = - (clReleaseProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseProgram"); + (clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram"); - clReleaseMemObjectFcnPtr = (clReleaseMemObjectFcnTy *)getAPIHandleCL( - HandleOpenCL, "clReleaseMemObject"); + clReleaseMemObjectFcnPtr = + (clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject"); clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL( - HandleOpenCL, "clReleaseCommandQueue"); + Handle, "clReleaseCommandQueue"); clReleaseContextFcnPtr = - (clReleaseContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseContext"); + (clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext"); return 1; } @@ -481,12 +500,34 @@ } cl_int Ret; - size_t BinarySize = strlen(BinaryBuffer); - ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr( - ((OpenCLContext *)GlobalContext->Context)->Context, 1, &GlobalDeviceID, - (const size_t *)&BinarySize, (const unsigned char **)&BinaryBuffer, NULL, - &Ret); - checkOpenCLError(Ret, "Failed to create program from binary.\n"); + + if (HandleOpenCLBeignet) { + // TODO: This is a workaround, since clCreateProgramWithLLVMIntel only + // accepts a filename to a valid llvm-ir file as an argument, instead + // of accepting the BinaryBuffer directly. + FILE *fp = fopen("kernel.ll", "wb"); + if (fp != NULL) { + fputs(BinaryBuffer, fp); + fclose(fp); + } + if (clCreateProgramWithLLVMIntelFcnPtr == NULL) + printf("What\n"); + + ((OpenCLKernel *)Function->Kernel)->Program = + clCreateProgramWithLLVMIntelFcnPtr( + ((OpenCLContext *)GlobalContext->Context)->Context, 1, + &GlobalDeviceID, "kernel.ll", &Ret); + checkOpenCLError(Ret, "Failed to create program from llvm.\n"); + unlink("kernel.ll"); + } else { + size_t BinarySize = strlen(BinaryBuffer); + ((OpenCLKernel *)Function->Kernel)->Program = + clCreateProgramWithBinaryFcnPtr( + ((OpenCLContext *)GlobalContext->Context)->Context, 1, + &GlobalDeviceID, (const size_t *)&BinarySize, + (const unsigned char **)&BinaryBuffer, NULL, &Ret); + checkOpenCLError(Ret, "Failed to create program from binary.\n"); + } Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1, &GlobalDeviceID, NULL, NULL, NULL);