Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -188,6 +188,10 @@
 ModulePass *createR600OpenCLImageTypeLoweringPass();
 FunctionPass *createAMDGPUAnnotateUniformValues();
 
+ModulePass *createAMDGPULowerKernelCallsPass();
+void initializeAMDGPULowerKernelCallsPass(PassRegistry&);
+extern char &AMDGPULowerKernelCallsID;
+
 ModulePass *createAMDGPUPrintfRuntimeBinding();
 void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry&);
 extern char &AMDGPUPrintfRuntimeBindingID;
Index: lib/Target/AMDGPU/AMDGPULowerKernelCalls.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPULowerKernelCalls.cpp
@@ -0,0 +1,126 @@
+//===-- AMDGPULowerKernelCalls.cpp - Fix kernel calls ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Replace calls to OpenCL kernels with calls to equivalent non-kernel
+/// functions.
+///
+/// In OpenCL, a kernel may call another kernel as if it were a non-kernel
+/// function. However, kernels and functions have different ABIs. To fix this,
+/// we copy the body of kernel A into a new non-kernel function fA if we
+/// encounter a call to A. All calls to A are then redirected to fA.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+namespace {
+/// Module pass that redirects calls of amdgpu_kernel functions to non-kernel
+/// copies of those kernels that use the C calling convention.
+class AMDGPULowerKernelCalls : public ModulePass {
+public:
+  static char ID;
+  explicit AMDGPULowerKernelCalls();
+
+  StringRef getPassName() const override {
+    return "AMDGPU Lower Kernel Calls";
+  }
+
+private:
+  bool runOnModule(Module &M) override;
+};
+} // end anonymous namespace
+
+char AMDGPULowerKernelCalls::ID = 0;
+
+namespace llvm {
+void initializeAMDGPULowerKernelCallsPass(PassRegistry &);
+
+ModulePass *createAMDGPULowerKernelCallsPass() {
+  return new AMDGPULowerKernelCalls();
+}
+} // end namespace llvm
+
+char &llvm::AMDGPULowerKernelCallsID = AMDGPULowerKernelCalls::ID;
+
+INITIALIZE_PASS(
+    AMDGPULowerKernelCalls, "amdgpu-lower-kernel-calls",
+    "Lower calls to kernel functions into non-kernel function calls.", false,
+    false)
+
+AMDGPULowerKernelCalls::AMDGPULowerKernelCalls() : ModulePass(ID) {
+  initializeAMDGPULowerKernelCallsPass(*PassRegistry::getPassRegistry());
+}
+
+/// Name \p FBody "__amdgpu_<kernel name>_kernel_body", based on \p FKernel.
+static void setNameForBody(Function *FBody, const Function &FKernel) {
+  StringRef Name = FKernel.getName();
+  SmallString<128> NewName("__amdgpu_");
+  NewName += Name;
+  NewName += "_kernel_body";
+
+  FBody->setName(NewName.str());
+}
+
+/// Create the non-kernel counterpart of kernel \p F: a clone when \p F has a
+/// body, otherwise a new external declaration with the same function type.
+/// \returns the new function, which uses the C calling convention.
+static Function *cloneKernel(Function &F) {
+  ValueToValueMapTy ignored;
+  Function *NewF = F.empty()
+                       ? Function::Create(
+                             F.getFunctionType(), Function::ExternalLinkage, "",
+                             F.getParent())
+                       : CloneFunction(&F, ignored);
+  NewF->setCallingConv(CallingConv::C);
+  // If we are copying a definition, we know there are no external references
+  // and we can force internal linkage.
+  if (!NewF->isDeclaration()) {
+    NewF->setVisibility(GlobalValue::DefaultVisibility);
+    NewF->setLinkage(GlobalValue::InternalLinkage);
+  }
+  setNameForBody(NewF, F);
+  return NewF;
+}
+
+/// Redirect calls of amdgpu_kernel functions in \p M to non-kernel copies.
+/// \returns true if any call site was rewritten.
+bool AMDGPULowerKernelCalls::runOnModule(Module &M) {
+  bool Changed = false;
+  for (auto &F : M) {
+    if (CallingConv::AMDGPU_KERNEL != F.getCallingConv())
+      continue;
+    // The non-kernel copy is created lazily, on the first call found.
+    Function *FBody = nullptr;
+    // setCalledFunction() drops the use being visited, so advance the
+    // iterator before rewriting.
+    for (Use &U : make_early_inc_range(F.uses())) {
+      CallInst *CI = dyn_cast<CallInst>(U.getUser());
+      // Only rewrite when F is the callee; a call that merely passes F as an
+      // argument must keep its own callee.
+      if (!CI || !CI->isCallee(&U))
+        continue;
+      if (!FBody)
+        FBody = cloneKernel(F);
+      CI->setCalledFunction(FBody);
+      CI->setCallingConv(CallingConv::C);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -239,6 +239,7 @@
   initializeAMDGPUSimplifyLibCallsPass(*PR);
   initializeAMDGPUInlinerPass(*PR);
   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
+  initializeAMDGPULowerKernelCallsPass(*PR);
   initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
 }
@@ -667,6 +668,8 @@
   // bitcast calls.
   addPass(createAMDGPUFixFunctionBitcastsPass());
 
+  // This pass should be performed on the linked module.
+  addPass(createAMDGPULowerKernelCallsPass());
   // A call to propagate attributes pass in the backend in case opt was not run.
addPass(createAMDGPUPropagateAttributesEarlyPass(&TM)); Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -51,6 +51,7 @@ AMDGPULowerIntrinsics.cpp AMDGPULowerKernelArguments.cpp AMDGPULowerKernelAttributes.cpp + AMDGPULowerKernelCalls.cpp AMDGPUMachineCFGStructurizer.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp Index: test/CodeGen/AMDGPU/call-to-kernel-undefined.ll =================================================================== --- test/CodeGen/AMDGPU/call-to-kernel-undefined.ll +++ test/CodeGen/AMDGPU/call-to-kernel-undefined.ll @@ -1,16 +1,21 @@ -; RUN: not llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN %s -; FIXME: It should be invalid IR to have a call to a kernel, but this -; is currently relied on, but should be eliminated before codegen. +; GCN: callee_kernel: +; GCN: s_endpgm +; GCN: __amdgpu_callee_kernel_kernel_body +; GCN: s_setpc_b64 define amdgpu_kernel void @callee_kernel(i32 addrspace(1)* %out) #0 { entry: store volatile i32 0, i32 addrspace(1)* %out ret void } -; Make sure there's no crash when the callsite calling convention -; doesn't match. 
-; CHECK: LLVM ERROR: invalid call to entry function +; GCN: caller_kernel: +; GCN: s_getpc_b64 s{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]] +; GCN: s_add_u32 s[[LO2:[0-9]+]], s[[LO1]], __amdgpu_callee_kernel_kernel_body@rel32@lo+4 +; GCN: s_addc_u32 s[[HI2:[0-9]+]], s[[HI1]], __amdgpu_callee_kernel_kernel_body@rel32@hi+4 +; GCN: s_swappc_b64 s[{{[0-9:]+}}], s{{\[}}[[LO2]]:[[HI2]]] +; GCN: s_endpgm define amdgpu_kernel void @caller_kernel(i32 addrspace(1)* %out) #0 { entry: call void @callee_kernel(i32 addrspace(1)* %out) Index: test/CodeGen/AMDGPU/call-to-kernel.ll =================================================================== --- test/CodeGen/AMDGPU/call-to-kernel.ll +++ test/CodeGen/AMDGPU/call-to-kernel.ll @@ -1,14 +1,21 @@ -; RUN: not llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN %s -; FIXME: It should be invalid IR to have a call to a kernel, but this -; is currently relied on, but should be eliminated before codegen. +; GCN: callee_kernel: +; GCN: s_endpgm +; GCN: __amdgpu_callee_kernel_kernel_body +; GCN: s_setpc_b64 define amdgpu_kernel void @callee_kernel(i32 addrspace(1)* %out) #0 { entry: store volatile i32 0, i32 addrspace(1)* %out ret void } -; CHECK: LLVM ERROR: Unsupported calling convention for call +; GCN: caller_kernel: +; GCN: s_getpc_b64 s{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]] +; GCN: s_add_u32 s[[LO2:[0-9]+]], s[[LO1]], __amdgpu_callee_kernel_kernel_body@rel32@lo+4 +; GCN: s_addc_u32 s[[HI2:[0-9]+]], s[[HI1]], __amdgpu_callee_kernel_kernel_body@rel32@hi+4 +; GCN: s_swappc_b64 s[{{[0-9:]+}}], s{{\[}}[[LO2]]:[[HI2]]] +; GCN: s_endpgm define amdgpu_kernel void @caller_kernel(i32 addrspace(1)* %out) #0 { entry: call amdgpu_kernel void @callee_kernel(i32 addrspace(1)* %out)