Index: lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- lib/CodeGen/PPCGCodeGeneration.cpp
+++ lib/CodeGen/PPCGCodeGeneration.cpp
@@ -255,8 +255,13 @@
   ///
   /// @param Kernel The kernel to scan for llvm::Values
   ///
-  /// @returns A set of values referenced by the kernel.
-  SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);
+  /// @returns A pair, whose first element contains the set of
+  ///          values referenced by the kernel, and whose second
+  ///          element contains the set of functions referenced by
+  ///          the kernel. All functions in the second set satisfy
+  ///          isValidFunctionInKernel.
+  std::pair<SetVector<Value *>, SetVector<Function *>>
+  getReferencesInKernel(ppcg_kernel *Kernel);
 
   /// Compute the sizes of the execution grid for a given kernel.
   ///
@@ -366,7 +371,8 @@
   /// @param Kernel        The kernel to generate code for.
   /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
   void createKernelFunction(ppcg_kernel *Kernel,
-                            SetVector<Value *> &SubtreeValues);
+                            SetVector<Value *> &SubtreeValues,
+                            SetVector<Function *> &SubtreeFunctions);
 
   /// Create the declaration of a kernel function.
   ///
@@ -389,6 +395,24 @@
   /// @param The kernel to generate the intrinsic functions for.
   void insertKernelIntrinsics(ppcg_kernel *Kernel);
 
+  /// Set up the creation of functions referenced by the GPU kernel.
+  ///
+  /// 1. Create new functions in GPUModule which are the same as
+  ///    SubtreeFunctions.
+  ///
+  /// 2. Populate IslNodeBuilder::BlockGen::ValueMap with mappings from
+  ///    old functions (that come from the original module) to new functions
+  ///    (that are created within GPUModule). That way, we generate references
+  ///    to the correct function (in GPUModule) in BlockGenerator.
+  ///
+  /// @see IslNodeBuilder::BlockGen::ValueMap
+  /// @see BlockGenerator::getNewValue
+  /// @see GPUNodeBuilder::getReferencesInKernel
+  ///
+  /// @param SubtreeFunctions The set of llvm::Functions referenced by
+  ///                         this kernel.
+  void setupKernelSubtreeFunctions(SetVector<Function *> SubtreeFunctions);
+
   /// Create a global-to-shared or shared-to-global copy statement.
   ///
   /// @param CopyStmt The copy statement to generate code for
@@ -1109,7 +1133,40 @@
   return isl_bool_true;
 }
 
-SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
+// By this point, we have only allowed "safe" function calls which we know
+// will be lowered. For now, we only allow intrinsics within kernel functions.
+// TODO: Is this too lax? We may be allowing intrinsics that cannot be lowered.
+static bool isValidFunctionInKernel(llvm::Function *F) {
+  assert(F && "F is an invalid pointer");
+  return F->getName().startswith("llvm.sqrt");
+}
+
+// Do not take a `Function` as a subtree value.
+// We take references to all subtree values and pass them along to the
+// kernel from the host. Taking the address of a function and trying to
+// pass it along is nonsensical. Only allow `Value`s that are not
+// `Function`s.
+static bool isValidSubtreeValue(llvm::Value *V) { return !isa<Function>(V); }
+
+// TODO: Consider converting to filter + map. Not sure how to use the
+// `map_iterator` in llvm's STLExtras.
+static SetVector<Function *>
+getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues) {
+  SetVector<Function *> SubtreeFunctions;
+  for (Value *It : RawSubtreeValues) {
+    Function *F = dyn_cast<Function>(It);
+    if (F) {
+      assert(isValidFunctionInKernel(F) && "Code should have bailed out by "
+                                           "this point if an invalid function "
+                                           "were present in a kernel.");
+      SubtreeFunctions.insert(F);
+    }
+  }
+  return SubtreeFunctions;
+}
+
+std::pair<SetVector<Value *>, SetVector<Function *>>
+GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
   SetVector<Value *> SubtreeValues;
   SetVector<const SCEV *> SCEVs;
   SetVector<const Loop *> Loops;
@@ -1146,7 +1203,34 @@
     isl_id_free(Id);
   }
 
-  return SubtreeValues;
+  // TODO: Consider creating both SubtreeValues and SubtreeFunctions in one
+  // loop, since they are mutually exclusive; two passes are easier to read.
+  auto ValidSubtreeValuesIt =
+      make_filter_range(SubtreeValues, isValidSubtreeValue);
+  SetVector<Value *> ValidSubtreeValues(ValidSubtreeValuesIt.begin(),
+                                        ValidSubtreeValuesIt.end());
+
+  SetVector<Function *> ValidSubtreeFunctions(
+      getFunctionsFromRawSubtreeValues(SubtreeValues));
+
+#ifndef NDEBUG
+  // Note that { ValidSubtreeValues, ValidSubtreeFunctions } partitions
+  // SubtreeValues. We check that property here.
+
+  // Check that ValidSubtreeValues U ValidSubtreeFunctions == SubtreeValues.
+  {
+    auto ValidSubtreeValuesUnion = SetVector<Value *>(
+        ValidSubtreeValues.begin(), ValidSubtreeValues.end());
+    ValidSubtreeValuesUnion.set_union(ValidSubtreeFunctions);
+    assert(ValidSubtreeValuesUnion == SubtreeValues);
+  }
+
+// TODO: Check that ValidSubtreeValues and ValidSubtreeFunctions are
+// disjoint. llvm::SetVector does not support set_intersect, and
+// llvm::set_intersect from llvm/ADT/SetOperations.h does not work over
+// llvm::SetVector, unfortunately.
+#endif
+  return std::make_pair(ValidSubtreeValues, ValidSubtreeFunctions);
 }
 
 void GPUNodeBuilder::clearDominators(Function *F) {
@@ -1353,6 +1437,24 @@
                        Launch + "_params_i8ptr", Location);
 }
 
+void GPUNodeBuilder::setupKernelSubtreeFunctions(
+    SetVector<Function *> SubtreeFunctions) {
+  for (auto Fn : SubtreeFunctions) {
+    const std::string ClonedFnName = Fn->getName();
+    Function *Clone = GPUModule->getFunction(ClonedFnName);
+    if (!Clone)
+      Clone =
+          Function::Create(Fn->getFunctionType(), GlobalValue::ExternalLinkage,
+                           ClonedFnName, GPUModule.get());
+    assert(Clone && "Expected cloned function to be initialized.");
+#ifndef NDEBUG
+    assert(ValueMap.find(Fn) == ValueMap.end() &&
+           "Fn already present in ValueMap.");
+#endif
+    ValueMap[Fn] = Clone;
+  }
+}
+
 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
   isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
   ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
@@ -1369,7 +1471,9 @@
   Value *BlockDimX, *BlockDimY, *BlockDimZ;
   std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
 
-  SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);
+  SetVector<Value *> SubtreeValues;
+  SetVector<Function *> SubtreeFunctions;
+  std::tie(SubtreeValues, SubtreeFunctions) = getReferencesInKernel(Kernel);
 
   assert(Kernel->tree && "Device AST of kernel node is empty");
 
@@ -1393,7 +1497,8 @@
       SubtreeValues.insert(V);
   }
 
-  createKernelFunction(Kernel, SubtreeValues);
+  createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions);
+  setupKernelSubtreeFunctions(SubtreeFunctions);
 
   create(isl_ast_node_copy(Kernel->tree));
 
@@ -1721,8 +1826,9 @@
   }
 }
 
-void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
-                                          SetVector<Value *> &SubtreeValues) {
+void GPUNodeBuilder::createKernelFunction(
+    ppcg_kernel *Kernel, SetVector<Value *> &SubtreeValues,
+    SetVector<Function *> &SubtreeFunctions) {
   std::string Identifier = "kernel_" + std::to_string(Kernel->id);
   GPUModule.reset(new Module(Identifier, Builder.getContext()));
 
@@ -2611,9 +2717,15 @@
     return isl_ast_expr_ge(Iterations, MinComputeExpr);
   }
 
-  /// Check whether the Block contains any Function value.
-  bool ContainsFnPtrValInBlock(const BasicBlock *BB) {
-    for (const Instruction &Inst : *BB)
+  /// Return true if this basic block does something with a `Function` other
+  /// than calling a function that we support in a kernel.
+  bool ContainsInvalidKernelFunctionInBlock(const BasicBlock *BB) {
+    for (const Instruction &Inst : *BB) {
+      const CallInst *Call = dyn_cast<CallInst>(&Inst);
+      if (Call && Call->getCalledFunction() &&
+          isValidFunctionInKernel(Call->getCalledFunction()))
+        continue;
+
       for (Value *SrcVal : Inst.operands()) {
         PointerType *p = dyn_cast<PointerType>(SrcVal->getType());
         if (!p)
@@ -2621,20 +2733,21 @@
         if (isa<FunctionType>(p->getElementType()))
           return true;
       }
+    }
     return false;
   }
 
-  /// Return whether the Scop S has functions.
-  bool ContainsFnPtr(const Scop &S) {
+  /// Return whether the Scop S uses functions in a way that we do not support.
+  bool ContainsInvalidKernelFunction(const Scop &S) {
     for (auto &Stmt : S) {
       if (Stmt.isBlockStmt()) {
-        if (ContainsFnPtrValInBlock(Stmt.getBasicBlock()))
+        if (ContainsInvalidKernelFunctionInBlock(Stmt.getBasicBlock()))
          return true;
      } else {
        assert(Stmt.isRegionStmt() &&
               "Stmt was neither block nor region statement");
        for (const BasicBlock *BB : Stmt.getRegion()->blocks())
-          if (ContainsFnPtrValInBlock(BB))
+          if (ContainsInvalidKernelFunctionInBlock(BB))
            return true;
      }
    }
@@ -2707,13 +2820,18 @@
     DL = &S->getRegion().getEntry()->getModule()->getDataLayout();
     RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
 
-    // We currently do not support functions inside kernels, as code
-    // generation will need to offload function calls to the kernel.
-    // This may lead to a kernel trying to call a function on the host.
+    // We currently do not support functions other than intrinsics inside
+    // kernels, as code generation will need to offload function calls to the
+    // kernel. This may lead to a kernel trying to call a function on the host.
     // This also allows us to prevent codegen from trying to take the
    // address of an intrinsic function to send to the kernel.
-    if (ContainsFnPtr(CurrentScop))
+    if (ContainsInvalidKernelFunction(CurrentScop)) {
+      DEBUG(
+          dbgs()
+          << "Scop contains a function that cannot be materialised in a GPU "
+             "kernel. Bailing out.\n");
       return false;
+    }
 
     auto PPCGScop = createPPCGScop();
     auto PPCGProg = createPPCGProg(PPCGScop);
Index: test/GPGPU/intrinsic-copied-into-kernel.ll
===================================================================
--- /dev/null
+++ test/GPGPU/intrinsic-copied-into-kernel.ll
@@ -0,0 +1,64 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadPolly -analyze -polly-codegen-ppcg -polly-acc-dump-kernel-ir < %s | FileCheck %s --check-prefix=KERNEL-IR
+; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s --check-prefix=HOST-IR
+
+; Test that we recognise and codegen a kernel that contains intrinsics.
+
+; Check that we model the kernel as a scop.
+; SCOP: Function: f
+; SCOP-NEXT: Region: %entry.split---%for.end
+
+; Check that the intrinsic call is present in the kernel IR.
+; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
+; KERNEL-IR: declare float @llvm.sqrt.f32(float) #2
+
+; Check that the kernel launch is generated in the host IR.
+; The declare would not be generated unless a call to a kernel exists.
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+
+; void f(float *A, float *B, int N) {
+;   for(int i = 0; i < N; i++) {
+;     B[i] = sqrt(A[i]);
+;   }
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(float* %A, float* %B, i32 %N) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  %cmp1 = icmp sgt i32 %N, 0
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry.split
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %A.arr.i.val = load float, float* %A.arr.i, align 4
+  ; Call to an intrinsic that should become part of the kernel.
+  %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
+  %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  store float %sqrt, float* %B.arr.i, align 4
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %wide.trip.count = zext i32 %N to i64
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                      ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.sqrt.f32(float) #0
+
+attributes #0 = { nounwind readnone }
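A minimal sketch of the value/function partition, for readers who want to experiment with it outside of Polly. This is not the patch's code: partitionSubtreeValues and its names are invented for illustration, and a free function returning std::pair stands in for the member function above.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Function.h"
#include <utility>

using namespace llvm;

// Split a raw reference set the way getReferencesInKernel does: values that
// are passed to the kernel by reference, and functions that are instead
// re-declared inside the GPU module.
static std::pair<SetVector<Value *>, SetVector<Function *>>
partitionSubtreeValues(const SetVector<Value *> &Raw) {
  auto NonFns =
      make_filter_range(Raw, [](Value *V) { return !isa<Function>(V); });
  SetVector<Value *> Values(NonFns.begin(), NonFns.end());

  SetVector<Function *> Fns;
  for (Value *V : Raw)
    if (auto *F = dyn_cast<Function>(V))
      Fns.insert(F);

  return std::make_pair(Values, Fns);
}

Every element of Raw lands in exactly one of the two sets, which is the partition property that the NDEBUG block in getReferencesInKernel asserts.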
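The declare-and-remap step performed by setupKernelSubtreeFunctions can be pictured in isolation as follows. Again a sketch, not the patch's code: a plain DenseMap stands in for Polly's ValueMapT, and declareInGPUModule is a hypothetical helper name.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// For each host function, ensure a declaration with the same type and name
// exists in the kernel module, and record the old-to-new mapping so that
// BlockGenerator::getNewValue emits calls to the kernel-side declaration
// instead of the host-side one.
static void declareInGPUModule(ArrayRef<Function *> Fns, Module &KernelM,
                               DenseMap<Value *, Value *> &Map) {
  for (Function *Fn : Fns) {
    // Reuse a declaration if a previously generated kernel already made one.
    Function *Clone = KernelM.getFunction(Fn->getName());
    if (!Clone)
      Clone = Function::Create(Fn->getFunctionType(),
                               GlobalValue::ExternalLinkage, Fn->getName(),
                               &KernelM);
    Map[Fn] = Clone;
  }
}

Because the clone is only a declaration, lowering the call is left to the GPU backend; this is why isValidFunctionInKernel admits nothing beyond the llvm.sqrt.* intrinsics, which the backend is expected to lower.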