Index: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -255,8 +255,12 @@
   ///
   /// @param Kernel The kernel to scan for llvm::Values
   ///
-  /// @returns A set of values referenced by the kernel.
-  SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);
+  /// @returns A pair, whose first element contains the set of values
+  /// referenced by the kernel, and whose second element contains the
+  /// set of functions referenced by the kernel. All functions in the
+  /// second set satisfy isValidFunctionInKernel.
+  std::pair<SetVector<Value *>, SetVector<Function *>>
+  getReferencesInKernel(ppcg_kernel *Kernel);
 
   /// Compute the sizes of the execution grid for a given kernel.
   ///
@@ -365,8 +369,11 @@
   ///
   /// @param Kernel        The kernel to generate code for.
   /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
+  /// @param SubtreeFunctions The set of llvm::Functions referenced by this
+  ///                         kernel.
   void createKernelFunction(ppcg_kernel *Kernel,
-                            SetVector<Value *> &SubtreeValues);
+                            SetVector<Value *> &SubtreeValues,
+                            SetVector<Function *> &SubtreeFunctions);
 
   /// Create the declaration of a kernel function.
   ///
@@ -389,6 +396,25 @@
   /// @param The kernel to generate the intrinsic functions for.
   void insertKernelIntrinsics(ppcg_kernel *Kernel);
 
+  /// Set up the creation of functions referenced by the GPU kernel.
+  ///
+  /// 1. Create new function declarations in GPUModule which are the same as
+  /// SubtreeFunctions.
+  ///
+  /// 2. Populate IslNodeBuilder::ValueMap with mappings from
+  /// old functions (that come from the original module) to new functions
+  /// (that are created within GPUModule). That way, we generate references
+  /// to the correct function (in GPUModule) in BlockGenerator.
+  ///
+  /// @see IslNodeBuilder::ValueMap
+  /// @see BlockGenerator::GlobalMap
+  /// @see BlockGenerator::getNewValue
+  /// @see GPUNodeBuilder::getReferencesInKernel.
+  ///
+  /// @param SubtreeFunctions The set of llvm::Functions referenced by
+  ///        this kernel.
+  void setupKernelSubtreeFunctions(SetVector<Function *> SubtreeFunctions);
+
   /// Create a global-to-shared or shared-to-global copy statement.
   ///
   /// @param CopyStmt The copy statement to generate code for
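The two steps described in the setupKernelSubtreeFunctions comment above can be sketched in isolation. The following is a minimal, hypothetical illustration, not code from the patch: a plain DenseMap stands in for IslNodeBuilder::ValueMap, and declareInGPUModule is an invented name.

#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Hypothetical helper mirroring the two documented steps.
static Function *declareInGPUModule(Function *HostFn, Module &GPUModule,
                                    DenseMap<Value *, Value *> &ValueMap) {
  // Step 1: create (or reuse) a declaration with the same name and type
  // inside the GPU module.
  Function *Clone = GPUModule.getFunction(HostFn->getName());
  if (!Clone)
    Clone = Function::Create(HostFn->getFunctionType(),
                             GlobalValue::ExternalLinkage, HostFn->getName(),
                             &GPUModule);
  // Step 2: record the old->new mapping, so that later code generation
  // emits references to the declaration inside the GPU module rather than
  // to the host function.
  ValueMap[HostFn] = Clone;
  return Clone;
}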
@@ -1109,7 +1135,40 @@
   return isl_bool_true;
 }
 
-SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
+/// Check if F is a function that we can code-generate in a GPU kernel.
+static bool isValidFunctionInKernel(llvm::Function *F) {
+  assert(F && "F is an invalid pointer");
+  // We string compare against the name of the function to allow
+  // all variants of the intrinsic "llvm.sqrt.*".
+  return F->isIntrinsic() && F->getName().startswith("llvm.sqrt");
+}
+
+/// Do not take `Function` as a subtree value.
+///
+/// We try to take the reference of all subtree values and pass them along
+/// to the kernel from the host. Taking the address of any function and
+/// trying to pass it along is nonsensical. Only allow `Value`s that are not
+/// `Function`s.
+static bool isValidSubtreeValue(llvm::Value *V) { return !isa<Function>(V); }
+
+/// Return `Function`s from `RawSubtreeValues`.
+static SetVector<Function *>
+getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues) {
+  SetVector<Function *> SubtreeFunctions;
+  for (Value *It : RawSubtreeValues) {
+    Function *F = dyn_cast<Function>(It);
+    if (F) {
+      assert(isValidFunctionInKernel(F) && "Code should have bailed out by "
+                                           "this point if an invalid function "
+                                           "were present in a kernel.");
+      SubtreeFunctions.insert(F);
+    }
+  }
+  return SubtreeFunctions;
+}
+
+std::pair<SetVector<Value *>, SetVector<Function *>>
+GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
   SetVector<Value *> SubtreeValues;
   SetVector<const SCEV *> SCEVs;
   SetVector<const Loop *> Loops;
@@ -1146,7 +1205,19 @@
     isl_id_free(Id);
   }
 
-  return SubtreeValues;
+  // Note: { ValidSubtreeValues, ValidSubtreeFunctions } partitions
+  // SubtreeValues. This is important, because we should not lose any
+  // SubtreeValues in the process of constructing the
+  // ValidSubtree{Values, Functions} sets. Nor should the sets
+  // ValidSubtree{Values, Functions} have any common element.
+  auto ValidSubtreeValuesIt =
+      make_filter_range(SubtreeValues, isValidSubtreeValue);
+  SetVector<Value *> ValidSubtreeValues(ValidSubtreeValuesIt.begin(),
+                                        ValidSubtreeValuesIt.end());
+  SetVector<Function *> ValidSubtreeFunctions(
+      getFunctionsFromRawSubtreeValues(SubtreeValues));
+
+  return std::make_pair(ValidSubtreeValues, ValidSubtreeFunctions);
 }
 
 void GPUNodeBuilder::clearDominators(Function *F) {
@@ -1353,6 +1424,21 @@
                      Launch + "_params_i8ptr", Location);
 }
 
+void GPUNodeBuilder::setupKernelSubtreeFunctions(
+    SetVector<Function *> SubtreeFunctions) {
+  for (auto Fn : SubtreeFunctions) {
+    const std::string ClonedFnName = Fn->getName();
+    Function *Clone = GPUModule->getFunction(ClonedFnName);
+    if (!Clone)
+      Clone =
+          Function::Create(Fn->getFunctionType(), GlobalValue::ExternalLinkage,
+                           ClonedFnName, GPUModule.get());
+    assert(Clone && "Expected cloned function to be initialized.");
+    assert(ValueMap.find(Fn) == ValueMap.end() &&
+           "Fn already present in ValueMap");
+    ValueMap[Fn] = Clone;
+  }
+}
 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
   isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
   ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
@@ -1369,7 +1455,9 @@
   Value *BlockDimX, *BlockDimY, *BlockDimZ;
   std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
 
-  SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);
+  SetVector<Value *> SubtreeValues;
+  SetVector<Function *> SubtreeFunctions;
+  std::tie(SubtreeValues, SubtreeFunctions) = getReferencesInKernel(Kernel);
 
   assert(Kernel->tree && "Device AST of kernel node is empty");
 
@@ -1393,7 +1481,8 @@
       SubtreeValues.insert(V);
   }
 
-  createKernelFunction(Kernel, SubtreeValues);
+  createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions);
+  setupKernelSubtreeFunctions(SubtreeFunctions);
 
   create(isl_ast_node_copy(Kernel->tree));
 
@@ -1721,8 +1810,9 @@
   }
 }
 
-void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
-                                          SetVector<Value *> &SubtreeValues) {
+void GPUNodeBuilder::createKernelFunction(
+    ppcg_kernel *Kernel, SetVector<Value *> &SubtreeValues,
+    SetVector<Function *> &SubtreeFunctions) {
   std::string Identifier = "kernel_" + std::to_string(Kernel->id);
   GPUModule.reset(new Module(Identifier, Builder.getContext()));
 
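The partition performed at the end of getReferencesInKernel can be reproduced standalone. A sketch under the same filtering idea, using only LLVM's ADT headers; partitionReferences is an illustrative name, not a function from the patch:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Function.h"

using namespace llvm;

// Split a raw reference set into non-function values and functions. Every
// element of Raw lands in exactly one of the two results, so the pair
// partitions the input.
static std::pair<SetVector<Value *>, SetVector<Function *>>
partitionReferences(const SetVector<Value *> &Raw) {
  auto NonFunctions =
      make_filter_range(Raw, [](Value *V) { return !isa<Function>(V); });
  SetVector<Value *> Values(NonFunctions.begin(), NonFunctions.end());

  SetVector<Function *> Functions;
  for (Value *V : Raw)
    if (auto *F = dyn_cast<Function>(V))
      Functions.insert(F);

  return std::make_pair(Values, Functions);
}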
@@ -2611,9 +2701,18 @@
     return isl_ast_expr_ge(Iterations, MinComputeExpr);
   }
 
-  /// Check whether the Block contains any Function value.
-  bool ContainsFnPtrValInBlock(const BasicBlock *BB) {
-    for (const Instruction &Inst : *BB)
+  /// Check if the basic block contains a function we cannot codegen for GPU
+  /// kernels.
+  ///
+  /// If this basic block does something with a `Function` other than calling
+  /// a function that we support in a kernel, return true.
+  bool containsInvalidKernelFunctionInBlock(const BasicBlock *BB) {
+    for (const Instruction &Inst : *BB) {
+      const CallInst *Call = dyn_cast<CallInst>(&Inst);
+      if (Call && isValidFunctionInKernel(Call->getCalledFunction())) {
+        continue;
+      }
+
       for (Value *SrcVal : Inst.operands()) {
         PointerType *p = dyn_cast<PointerType>(SrcVal->getType());
         if (!p)
@@ -2621,20 +2720,21 @@
         if (isa<FunctionType>(p->getElementType()))
           return true;
       }
+    }
     return false;
   }
 
-  /// Return whether the Scop S has functions.
-  bool ContainsFnPtr(const Scop &S) {
+  /// Return whether the Scop S uses functions in a way that we do not support.
+  bool containsInvalidKernelFunction(const Scop &S) {
     for (auto &Stmt : S) {
       if (Stmt.isBlockStmt()) {
-        if (ContainsFnPtrValInBlock(Stmt.getBasicBlock()))
+        if (containsInvalidKernelFunctionInBlock(Stmt.getBasicBlock()))
          return true;
       } else {
         assert(Stmt.isRegionStmt() &&
                "Stmt was neither block nor region statement");
         for (const BasicBlock *BB : Stmt.getRegion()->blocks())
-          if (ContainsFnPtrValInBlock(BB))
+          if (containsInvalidKernelFunctionInBlock(BB))
            return true;
      }
    }
@@ -2708,13 +2808,18 @@
     DL = &S->getRegion().getEntry()->getModule()->getDataLayout();
     RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
 
-    // We currently do not support functions inside kernels, as code
-    // generation will need to offload function calls to the kernel.
-    // This may lead to a kernel trying to call a function on the host.
+    // We currently do not support functions other than intrinsics inside
+    // kernels, as code generation will need to offload function calls to the
+    // kernel. This may lead to a kernel trying to call a function on the host.
     // This also allows us to prevent codegen from trying to take the
     // address of an intrinsic function to send to the kernel.
-    if (ContainsFnPtr(CurrentScop))
+    if (containsInvalidKernelFunction(CurrentScop)) {
+      DEBUG(dbgs()
+            << "Scop contains function which cannot be materialised in a GPU "
+               "kernel. Bailing out.\n";);
       return false;
+    }
 
     auto PPCGScop = createPPCGScop();
     auto PPCGProg = createPPCGProg(PPCGScop);
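The rejection rule the new check implements can be stated as a single predicate over one instruction: a direct call to a whitelisted intrinsic is allowed, while any operand that carries a function's address is not. A sketch under the same llvm.sqrt whitelist as above; usesFunctionBesidesSupportedCall is an illustrative name, not from the patch:

#include "llvm/IR/Instructions.h"

using namespace llvm;

// True iff Inst involves a Function in a way the kernel cannot support,
// i.e. anything other than a direct call to a supported intrinsic.
static bool usesFunctionBesidesSupportedCall(const Instruction &Inst) {
  if (const auto *Call = dyn_cast<CallInst>(&Inst))
    if (const Function *F = Call->getCalledFunction())
      if (F->isIntrinsic() && F->getName().startswith("llvm.sqrt"))
        return false; // A direct call to a supported intrinsic is fine.

  // An operand of function type behind a pointer means a function is
  // referenced by address, which cannot be passed to the kernel.
  for (const Value *Op : Inst.operands())
    if (const auto *PTy = dyn_cast<PointerType>(Op->getType()))
      if (isa<FunctionType>(PTy->getElementType()))
        return true;

  return false;
}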
Index: polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll
===================================================================
--- polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll
+++ polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll
@@ -0,0 +1,66 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadPolly -analyze -polly-codegen-ppcg -polly-acc-dump-kernel-ir < %s | FileCheck %s --check-prefix=KERNEL-IR
+; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s --check-prefix=HOST-IR
+
+; Test that we do recognise and codegen a kernel that has intrinsics.
+
+; REQUIRES: pollyacc
+
+; Check that we model the kernel as a scop.
+; SCOP: Function: f
+; SCOP-NEXT: Region: %entry.split---%for.end
+
+; Check that the intrinsic call is present in the kernel IR.
+; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
+; KERNEL-IR: declare float @llvm.sqrt.f32(float) #2
+
+; Check that the kernel launch is generated in the host IR.
+; The declare would not be generated unless a call to a kernel exists.
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+
+; void f(float *A, float *B, int N) {
+;     for(int i = 0; i < N; i++) {
+;         B[i] = sqrt(A[i]);
+;     }
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(float* %A, float* %B, i32 %N) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  %cmp1 = icmp sgt i32 %N, 0
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry.split
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %A.arr.i.val = load float, float* %A.arr.i, align 4
+  ; Call to intrinsic that should be part of the kernel.
+  %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
+  %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  store float %sqrt, float* %B.arr.i, align 4
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %wide.trip.count = zext i32 %N to i64
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                      ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.sqrt.f32(float) #0
+
+attributes #0 = { nounwind readnone }
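What the KERNEL-IR lines check for, namely the intrinsic redeclared inside the kernel's own module, can be reproduced with a few lines of LLVM's C++ API. A standalone sketch; the module name "kernel_0" and the function name "use_sqrt" are made up for illustration:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("kernel_0", Ctx); // Stand-in for the per-kernel GPU module.
  Type *FloatTy = Type::getFloatTy(Ctx);

  // Produces the declaration the test expects:
  //   declare float @llvm.sqrt.f32(float)
  Function *Sqrt = Intrinsic::getDeclaration(&M, Intrinsic::sqrt, {FloatTy});

  // A tiny caller, mirroring the call the kernel body contains.
  Function *F = Function::Create(FunctionType::get(FloatTy, {FloatTy}, false),
                                 GlobalValue::ExternalLinkage, "use_sqrt", &M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
  B.CreateRet(B.CreateCall(Sqrt, {&*F->arg_begin()}));

  M.print(outs(), nullptr);
  return verifyModule(M, &errs()) ? 1 : 0;
}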