Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -256,7 +256,8 @@ /// @param Kernel The kernel to scan for llvm::Values /// /// @returns A set of values referenced by the kernel. - SetVector getReferencesInKernel(ppcg_kernel *Kernel); + std::pair, SetVector> + getReferencesInKernel(ppcg_kernel *Kernel); /// Compute the sizes of the execution grid for a given kernel. /// @@ -366,7 +367,8 @@ /// @param Kernel The kernel to generate code for. /// @param SubtreeValues The set of llvm::Values referenced by this kernel. void createKernelFunction(ppcg_kernel *Kernel, - SetVector &SubtreeValues); + SetVector &SubtreeValues, + SetVector &SubtreeFunctions); /// Create the declaration of a kernel function. /// @@ -389,6 +391,9 @@ /// @param The kernel to generate the intrinsic functions for. void insertKernelIntrinsics(ppcg_kernel *Kernel); + void replaceKernelSubtreeFunctions(Function *KernelFunction, + SetVector &SubtreeFunctions); + /// Create a global-to-shared or shared-to-global copy statement. /// /// @param CopyStmt The copy statement to generate code for @@ -1109,7 +1114,40 @@ return isl_bool_true; } -SetVector GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { +// By this point, we have only allowed "safe" function calls which we know +// will be lowered. For now, we only allow intrinsics within kernel functions +// TODO: is this too lax? We maybe allowing intrinsics that cannot be lowered. +static bool IsValidFunctionInKernel(llvm::Function *F) { + assert(F && "F is a invalid pointer"); + return F->isIntrinsic(); +} + +// Do not take `Function` as a subtree value. +// We try to take the reference of all subtree values and pass them along +// to the kernel from the host. Taking an address of any function and +// trying to pass along is nonsensical. Only allow `Value`s that are not +// `Function`s. +static bool isValidSubtreeValue(llvm::Value *V) { return !isa(V); } + +// TODO: Consider converting to filter + map. Not sure how to use the +// `map_iterator` in llvm's STLExtras. +static SetVector +getFunctionsFromRawSubtreeValues(SetVector RawSubtreeValues) { + SetVector SubtreeFunctions; + for (Value *It : RawSubtreeValues) { + Function *F = dyn_cast(It); + if (F) { + assert(IsValidFunctionInKernel(F) && "Code should have bailed out by " + "this point if an invalid function " + "were present in a kernel"); + SubtreeFunctions.insert(F); + } + } + return SubtreeFunctions; +} + +std::pair, SetVector> +GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { SetVector SubtreeValues; SetVector SCEVs; SetVector Loops; @@ -1146,7 +1184,17 @@ isl_id_free(Id); } - return SubtreeValues; + // TODO: Consider creating both SubtreeValue and SubtreeFunction in one + // loop since they are mutually exclusive? This makes it harder to read. + auto ValidSubtreeValuesIt = + make_filter_range(SubtreeValues, isValidSubtreeValue); + SetVector ValidSubtreeValues(ValidSubtreeValuesIt.begin(), + ValidSubtreeValuesIt.end()); + + SetVector ValidSubtreeFunctions( + getFunctionsFromRawSubtreeValues(SubtreeValues)); + + return std::make_pair(ValidSubtreeValues, ValidSubtreeFunctions); } void GPUNodeBuilder::clearDominators(Function *F) { @@ -1369,7 +1417,9 @@ Value *BlockDimX, *BlockDimY, *BlockDimZ; std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); - SetVector SubtreeValues = getReferencesInKernel(Kernel); + SetVector SubtreeValues; + SetVector SubtreeFunctions; + std::tie(SubtreeValues, SubtreeFunctions) = getReferencesInKernel(Kernel); assert(Kernel->tree && "Device AST of kernel node is empty"); @@ -1393,13 +1443,14 @@ SubtreeValues.insert(V); } - createKernelFunction(Kernel, SubtreeValues); + createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions); create(isl_ast_node_copy(Kernel->tree)); finalizeKernelArguments(Kernel); Function *F = Builder.GetInsertBlock()->getParent(); addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ); + replaceKernelSubtreeFunctions(F, SubtreeFunctions); clearDominators(F); clearScalarEvolution(F); clearLoops(F); @@ -1561,6 +1612,45 @@ return FN; } +// can access state GPUNodeBuilder::GPUModle +void GPUNodeBuilder::replaceKernelSubtreeFunctions( + Function *KernelFunction, SetVector &SubtreeFunctions) { + Module *M = GPUModule.get(); + + StringMap ClonedFunctions; + + // Step 1: insert the functions referenced by the kernel in the GPU module + for (auto Fn : SubtreeFunctions) { + const std::string ClonedFnName = Fn->getName(); + Function *Clone = M->getFunction(ClonedFnName); + if (!Clone) + Clone = Function::Create(Fn->getFunctionType(), + GlobalValue::ExternalLinkage, ClonedFnName, M); + assert(Clone && "Expected cloned function to be initialized."); + ClonedFunctions[ClonedFnName] = Clone; + } + + // Step 2: replace all call() instructions to refer to the new function + for (BasicBlock &BB : *KernelFunction) { + for (Instruction &Inst : BB) { + + CallInst *Call = dyn_cast(&Inst); + if (!Call) + continue; + + // Note that not all `call` instructions are calling external functions. + // Some of them are intrinsics inserted by us. So, only replace those + // functions which are known to be external. + auto It = ClonedFunctions.find(Call->getCalledFunction()->getName()); + if (It == ClonedFunctions.end()) + continue; + Function *Replacement = It->getValue(); + assert(Replacement && "did not get a valid Function from iterator"); + Call->setCalledFunction(Replacement); + } + } +} + void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { Intrinsic::ID IntrinsicsBID[2]; Intrinsic::ID IntrinsicsTID[3]; @@ -1721,8 +1811,9 @@ } } -void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, - SetVector &SubtreeValues) { +void GPUNodeBuilder::createKernelFunction( + ppcg_kernel *Kernel, SetVector &SubtreeValues, + SetVector &SubtreeFunctions) { std::string Identifier = "kernel_" + std::to_string(Kernel->id); GPUModule.reset(new Module(Identifier, Builder.getContext())); @@ -2611,9 +2702,15 @@ return isl_ast_expr_ge(Iterations, MinComputeExpr); } - /// Check whether the Block contains any Function value. - bool ContainsFnPtrValInBlock(const BasicBlock *BB) { - for (const Instruction &Inst : *BB) + // If this basic block does something with a `Function` other than calling + // a function that we support in a kernel, return true. + bool ContainsInvalidKernelFunctionInBlock(const BasicBlock *BB) { + for (const Instruction &Inst : *BB) { + const CallInst *Call = dyn_cast(&Inst); + if (Call && IsValidFunctionInKernel(Call->getCalledFunction())) { + continue; + } + for (Value *SrcVal : Inst.operands()) { PointerType *p = dyn_cast(SrcVal->getType()); if (!p) @@ -2621,20 +2718,21 @@ if (isa(p->getElementType())) return true; } + } return false; } - /// Return whether the Scop S has functions. - bool ContainsFnPtr(const Scop &S) { + /// Return whether the Scop S uses functions in a way that we do not support. + bool ContainsInvalidKernelFunction(const Scop &S) { for (auto &Stmt : S) { if (Stmt.isBlockStmt()) { - if (ContainsFnPtrValInBlock(Stmt.getBasicBlock())) + if (ContainsInvalidKernelFunctionInBlock(Stmt.getBasicBlock())) return true; } else { assert(Stmt.isRegionStmt() && "Stmt was neither block nor region statement"); for (const BasicBlock *BB : Stmt.getRegion()->blocks()) - if (ContainsFnPtrValInBlock(BB)) + if (ContainsInvalidKernelFunctionInBlock(BB)) return true; } } @@ -2711,13 +2809,18 @@ if (S->hasInvariantAccesses()) return false; - // We currently do not support functions inside kernels, as code - // generation will need to offload function calls to the kernel. - // This may lead to a kernel trying to call a function on the host. + // We currently do not support functions other than intrinsics inside + // kernels, as code generation will need to offload function calls to the + // kernel. This may lead to a kernel trying to call a function on the host. // This also allows us to prevent codegen from trying to take the // address of an intrinsic function to send to the kernel. - if (ContainsFnPtr(CurrentScop)) + if (ContainsInvalidKernelFunction(CurrentScop)) { + DEBUG( + dbgs() + << "Scop contains function which cannot be materialised in a GPU " + "kernel. bailing out.\n";); return false; + } auto PPCGScop = createPPCGScop(); auto PPCGProg = createPPCGProg(PPCGScop); Index: test/GPGPU/intrinsic-copied-into-kernel.ll =================================================================== --- /dev/null +++ test/GPGPU/intrinsic-copied-into-kernel.ll @@ -0,0 +1,64 @@ +; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadPolly -analyze -polly-codegen-ppcg -polly-acc-dump-kernel-ir < %s | FileCheck %s --check-prefix=KERNEL-IR +; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s --check-prefix=HOST-IR + +; Test that we do recognise and codegen a kernel that has intrinsics. + +; Check that we model the kernel as a scop. +; SCOP: Function: f +; SCOP-NEXT: Region: %entry.split---%for.end + +; Check that the intrinsic call is present in the kernel IR. +; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_) +; KERNEL-IR: declare float @llvm.sqrt.f32(float) #2 + +; Check that kernel launch is generated in host IR. +; the declare would not be generated unless a call to a kernel exists. +; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) + + +; void f(float *A, float *B, int N) { +; for(int i = 0; i < N; i++) { +; B[i] = sqrt(A[i]); +; } +; } + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(float* %A, float* %B, i32 %N) { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %cmp1 = icmp sgt i32 %N, 0 + br i1 %cmp1, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry.split + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv + %A.arr.i.val = load float, float* %A.arr.i, align 4 + ; Call to intrinsic that should be part of the kernel. + %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val) + %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv + store float %sqrt, float* %B.arr.i, align 4 + + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %N to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.sqrt.f32(float) #0 + +attributes #0 = { nounwind readnone } +