diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -576,6 +576,7 @@
                                 "amdgpu_code_object_version",
                                 getTarget().getTargetOpts().CodeObjectVersion);
     }
+    getTargetCodeGenInfo().finalizeModule(TheModule);
   }
 
   emitLLVMUsed();
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -247,6 +247,10 @@
                                        llvm::StringRef Value,
                                        llvm::SmallString<32> &Opt) const {}
 
+  /// Clean up and other special handling at the end when all functions are
+  /// codegenerated.
+  virtual void finalizeModule(llvm::Module &M) const {}
+
   /// Get LLVM calling convention for OpenCL kernel.
   virtual unsigned getOpenCLKernelCallingConv() const;
 
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -19,9 +19,9 @@
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/RecordLayout.h"
+#include "clang/Basic/Builtins.h"
 #include "clang/Basic/CodeGenOptions.h"
 #include "clang/Basic/DiagnosticFrontend.h"
-#include "clang/Basic/Builtins.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
 #include "clang/CodeGen/SwiftCallingConv.h"
 #include "llvm/ADT/SmallBitVector.h"
@@ -34,6 +34,7 @@
 #include "llvm/IR/IntrinsicsS390.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include <algorithm> // std::sort
 
 using namespace clang;
@@ -9217,6 +9218,7 @@
                             llvm::Value *BlockLiteral) const override;
   bool shouldEmitStaticExternCAliases() const override;
   void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
+  void finalizeModule(llvm::Module &M) const override;
 };
 }
 
@@ -9233,6 +9235,37 @@
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType()));
 }
 
+/// Return the non-kernel counterpart of the kernel \p F, creating it in F's
+/// module on first use.
+///
+/// The counterpart is named "__amdgpu_<kernel>_kernel_body" and uses the C
+/// calling convention. When \p F has a body the counterpart is a clone of it;
+/// otherwise it is an external declaration with the same function type.
+static llvm::Function *getKernelClone(llvm::Function &F) {
+  llvm::Module *M = F.getParent();
+  SmallString<128> MangledName("__amdgpu_");
+  MangledName.append(F.getName());
+  MangledName.append("_kernel_body");
+
+  // Reuse a function of that name if the module already has one.
+  if (llvm::Function *Existing = M->getFunction(MangledName))
+    return Existing;
+
+  llvm::Function *NewF;
+  if (F.empty()) {
+    // No body to copy: declare it under its final name.
+    NewF = llvm::Function::Create(F.getFunctionType(),
+                                  llvm::GlobalValue::ExternalLinkage,
+                                  MangledName, M);
+  } else {
+    llvm::ValueToValueMapTy VMap;
+    NewF = CloneFunction(&F, VMap);
+    NewF->setName(MangledName);
+  }
+  NewF->setCallingConv(llvm::CallingConv::C);
+  return NewF;
+}
+
 void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
   const auto *ReqdWGS =
@@ -9435,6 +9468,42 @@
       FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
 }
 
+/// Redirect direct calls to kernels so that they target non-kernel functions.
+///
+/// Only acts when compiling OpenCL. Every function in \p M that uses the
+/// AMDGPU_KERNEL calling convention gets a counterpart from getKernelClone(),
+/// and each call instruction whose callee is that kernel is retargeted to the
+/// counterpart and switched to the C calling convention.
+void AMDGPUTargetCodeGenInfo::finalizeModule(llvm::Module &M) const {
+  // Insert a cloned function body for each kernel and adjust the kernel
+  // callsite to use its equivalent clone function. For extern kernel calls,
+  // insert a declaration node since the body isn't available.
+  if (!getABIInfo().getContext().getLangOpts().OpenCL)
+    return;
+
+  for (auto &F : M) {
+    if (F.getCallingConv() != llvm::CallingConv::AMDGPU_KERNEL)
+      continue;
+
+    llvm::Function *Clone = getKernelClone(F);
+
+    // Gather the direct calls before rewriting any of them: retargeting a call
+    // edits F's use list, which must not happen while it is being walked. A
+    // call that only passes F as an argument is also a user of F, so require
+    // F to be the callee.
+    SmallVector<llvm::CallInst *, 8> DirectCalls;
+    for (llvm::User *U : F.users())
+      if (auto *CI = dyn_cast<llvm::CallInst>(U))
+        if (CI->getCalledOperand() == &F)
+          DirectCalls.push_back(CI);
+
+    for (llvm::CallInst *CI : DirectCalls) {
+      CI->setCalledFunction(Clone);
+      CI->setCallingConv(llvm::CallingConv::C);
+    }
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // SPARC v8 ABI Implementation.
 // Based on the SPARC Compliance Definition version 2.4.1.
diff --git a/clang/test/CodeGenOpenCL/amdgpu-kernel-calls.cl b/clang/test/CodeGenOpenCL/amdgpu-kernel-calls.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/amdgpu-kernel-calls.cl
@@ -0,0 +1,60 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
+
+// AMDGPU disallows calling a kernel from another kernel. For each kernel, clang codegen introduces
+// a cloned function body with a non-kernel calling convention, and amdgpu_kernel callsites are
+// rewritten to call the corresponding clone.
+
+extern kernel void test_extern_kernel_callee(global int *in);
+
+// CHECK: define dso_local amdgpu_kernel void @test_kernel_callee(i32 addrspace(1)* noundef align 4 %in)
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8, addrspace(5)
+// CHECK-NEXT: store i32 addrspace(1)* [[IN:%.*]], i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT: store i32 10, i32 addrspace(1)* [[TMP0]], align 4
+// CHECK-NEXT: ret void
+//
+kernel void test_kernel_callee(global int *in) {
+  *in = (int)(10);
+}
+
+// CHECK: define dso_local amdgpu_kernel void @test_kernel_caller(i32 addrspace(1)* noundef align 4 %in)
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8, addrspace(5)
+// CHECK-NEXT: store i32 addrspace(1)* [[IN:%.*]], i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT: %{{.*}} = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT: call void @__amdgpu_test_kernel_callee_kernel_body(
+// CHECK-NOT: call amdgpu_kernel void @test_kernel_callee(
+// CHECK-NEXT: %{{.*}} = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT: call void @__amdgpu_test_extern_kernel_callee_kernel_body(
+// CHECK-NOT: call amdgpu_kernel void @test_extern_kernel_callee(
+// CHECK-NEXT: ret void
+//
+kernel void test_kernel_caller(global int *in) {
+  test_kernel_callee(in);
+  test_extern_kernel_callee(in);
+}
+
+// CHECK: declare amdgpu_kernel void @test_extern_kernel_callee(i32 addrspace(1)* noundef align 4)
+
+// CHECK: define dso_local void @__amdgpu_test_kernel_callee_kernel_body(i32 addrspace(1)* noundef align 4 %in)
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8, addrspace(5)
+// CHECK-NEXT: store i32 addrspace(1)* %in, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT: store i32 10, i32 addrspace(1)* [[TMP0]], align 4
+// CHECK-NEXT: ret void
+
+// CHECK: define dso_local void @__amdgpu_test_kernel_caller_kernel_body(i32 addrspace(1)* noundef align 4 %in)
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8, addrspace(5)
+// CHECK-NEXT: store i32 addrspace(1)* [[IN:%.*]], i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT: %{{.*}} = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT: call void @__amdgpu_test_kernel_callee_kernel_body(
+// CHECK-NEXT: %{{.*}} = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT: call void @__amdgpu_test_extern_kernel_callee_kernel_body(
+// CHECK-NEXT: ret void
+//
+
+// CHECK: declare void @__amdgpu_test_extern_kernel_callee_kernel_body(i32 addrspace(1)*)
diff --git a/clang/test/CodeGenOpenCL/visibility.cl b/clang/test/CodeGenOpenCL/visibility.cl
--- a/clang/test/CodeGenOpenCL/visibility.cl
+++ b/clang/test/CodeGenOpenCL/visibility.cl
@@ -94,23 +94,6 @@
   ext_func_default();
 }
 
-// FVIS-DEFAULT: declare amdgpu_kernel void @ext_kern()
-// FVIS-PROTECTED: declare protected amdgpu_kernel void @ext_kern()
-// FVIS-HIDDEN: declare protected amdgpu_kernel void @ext_kern()
-
-// FVIS-DEFAULT: declare protected amdgpu_kernel void @ext_kern_hidden()
-// FVIS-PROTECTED: declare protected amdgpu_kernel void @ext_kern_hidden()
-// FVIS-HIDDEN: declare protected amdgpu_kernel void @ext_kern_hidden()
-
-// FVIS-DEFAULT: declare protected amdgpu_kernel void @ext_kern_protected()
-// FVIS-PROTECTED: declare protected amdgpu_kernel void @ext_kern_protected()
-// FVIS-HIDDEN: declare protected amdgpu_kernel void @ext_kern_protected()
-
-// FVIS-DEFAULT: declare amdgpu_kernel void @ext_kern_default()
-// FVIS-PROTECTED: declare amdgpu_kernel void @ext_kern_default()
-// FVIS-HIDDEN: declare amdgpu_kernel void @ext_kern_default()
-
-
 // FVIS-DEFAULT: declare void @ext_func()
 // FVIS-PROTECTED: declare protected void @ext_func()
 // FVIS-HIDDEN: declare hidden void @ext_func()
@@ -126,3 +109,21 @@
 // FVIS-DEFAULT: declare void @ext_func_default()
 // FVIS-PROTECTED: declare void @ext_func_default()
 // FVIS-HIDDEN: declare void @ext_func_default()
+
+// A kernel call will be emitted as a call to its cloned function
+// of non-kernel convention.
+// FVIS-DEFAULT: declare void @__amdgpu_ext_kern_kernel_body()
+// FVIS-PROTECTED: declare void @__amdgpu_ext_kern_kernel_body()
+// FVIS-HIDDEN: declare void @__amdgpu_ext_kern_kernel_body()
+
+// FVIS-DEFAULT: declare void @__amdgpu_ext_kern_hidden_kernel_body()
+// FVIS-PROTECTED: declare void @__amdgpu_ext_kern_hidden_kernel_body()
+// FVIS-HIDDEN: declare void @__amdgpu_ext_kern_hidden_kernel_body()
+
+// FVIS-DEFAULT: declare void @__amdgpu_ext_kern_protected_kernel_body()
+// FVIS-PROTECTED: declare void @__amdgpu_ext_kern_protected_kernel_body()
+// FVIS-HIDDEN: declare void @__amdgpu_ext_kern_protected_kernel_body()
+
+// FVIS-DEFAULT: declare void @__amdgpu_ext_kern_default_kernel_body()
+// FVIS-PROTECTED: declare void @__amdgpu_ext_kern_default_kernel_body()
+// FVIS-HIDDEN: declare void @__amdgpu_ext_kern_default_kernel_body()