diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -3142,6 +3142,37 @@
                                                                   a register allocator
                                                                   created spill
                                                                   location.
+     ".kind"                             string                   The kind of the kernel
+                                                                  with the following
+                                                                  values:
+
+                                                                  "normal"
+                                                                    Regular kernels.
+
+                                                                  "init"
+                                                                    These kernels must be
+                                                                    invoked after loading
+                                                                    the containing code
+                                                                    object and must
+                                                                    complete before any
+                                                                    normal and fini
+                                                                    kernels in the same
+                                                                    code object are
+                                                                    invoked.
+
+                                                                  "fini"
+                                                                    These kernels must be
+                                                                    invoked before
+                                                                    unloading the
+                                                                    containing code object
+                                                                    and after all init and
+                                                                    normal kernels in the
+                                                                    same code object have
+                                                                    been invoked and
+                                                                    completed.
+
+                                                                  If omitted, "normal" is
+                                                                  assumed.
      =================================== ============== ========= ================================
 
 ..
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -114,6 +114,10 @@
 void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &);
 extern char &AMDGPUFixFunctionBitcastsID;
 
+ModulePass *createAMDGPUCtorDtorLoweringPass();
+void initializeAMDGPUCtorDtorLoweringPass(PassRegistry &);
+extern char &AMDGPUCtorDtorLoweringID;
+
 FunctionPass *createAMDGPULowerKernelArgumentsPass();
 void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
 extern char &AMDGPULowerKernelArgumentsID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
@@ -0,0 +1,108 @@
+//===-- AMDGPUCtorDtorLowering.cpp - Handle global ctors and dtors --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This pass creates a unified init and fini kernel with the required metadata.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-lower-ctor-dtor"
+
+namespace {
+/// Emits an "amdgcn.device.init" kernel that calls the llvm.global_ctors
+/// functions and an "amdgcn.device.fini" kernel for llvm.global_dtors.
+class AMDGPUCtorDtorLowering final : public ModulePass {
+  bool runOnModule(Module &M) override;
+
+public:
+  /// Create an empty amdgpu_kernel named "amdgcn.device.init" (when \p IsCtor
+  /// is true) or "amdgcn.device.fini", tagged with the matching "device-init"
+  /// or "device-fini" function attribute.
+  Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
+    StringRef InitOrFiniKernelName = "amdgcn.device.init";
+    if (!IsCtor)
+      InitOrFiniKernelName = "amdgcn.device.fini";
+
+    Function *InitOrFiniKernel = Function::createWithDefaultAttr(
+        FunctionType::get(Type::getVoidTy(M.getContext()), false),
+        GlobalValue::InternalLinkage, 0, InitOrFiniKernelName, &M);
+    BasicBlock *InitOrFiniKernelBB =
+        BasicBlock::Create(M.getContext(), "", InitOrFiniKernel);
+    ReturnInst::Create(M.getContext(), InitOrFiniKernelBB);
+
+    InitOrFiniKernel->setCallingConv(CallingConv::AMDGPU_KERNEL);
+    if (IsCtor)
+      InitOrFiniKernel->addFnAttr("device-init");
+    else
+      InitOrFiniKernel->addFnAttr("device-fini");
+    return InitOrFiniKernel;
+  }
+
+  /// Emit a kernel that calls each function referenced by the entries of
+  /// \p GV, in array order; entry priorities are not consulted.
+  /// \returns true if a kernel was added to \p M.
+  bool createInitOrFiniKernel(Module &M, GlobalVariable *GV, bool IsCtor) {
+    // A missing global or one without an initializer has nothing to lower.
+    if (!GV || !GV->hasInitializer())
+      return false;
+    // An empty list may be spelled zeroinitializer, which is not a
+    // ConstantArray; treat that the same as an empty array.
+    auto *GA = dyn_cast<ConstantArray>(GV->getInitializer());
+    if (!GA || GA->getNumOperands() == 0)
+      return false;
+    Function *InitOrFiniKernel = createInitOrFiniKernelFunction(M, IsCtor);
+    IRBuilder<> IRB(InitOrFiniKernel->getEntryBlock().getTerminator());
+    for (Value *V : GA->operands()) {
+      auto *CS = cast<ConstantStruct>(V);
+      if (Function *F = dyn_cast<Function>(CS->getOperand(1))) {
+        FunctionCallee Ctor =
+            M.getOrInsertFunction(F->getName(), IRB.getVoidTy());
+        IRB.CreateCall(Ctor);
+      }
+    }
+    // Add the kernel to llvm.used so it is kept although nothing calls it.
+    appendToUsed(M, {InitOrFiniKernel});
+    return true;
+  }
+
+  static char ID;
+  AMDGPUCtorDtorLowering() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+char AMDGPUCtorDtorLowering::ID = 0;
+char &llvm::AMDGPUCtorDtorLoweringID = AMDGPUCtorDtorLowering::ID;
+INITIALIZE_PASS(AMDGPUCtorDtorLowering, DEBUG_TYPE,
+                "Lower ctors and dtors for AMDGPU", false, false)
+
+ModulePass *llvm::createAMDGPUCtorDtorLoweringPass() {
+  return new AMDGPUCtorDtorLowering();
+}
+
+bool AMDGPUCtorDtorLowering::runOnModule(Module &M) {
+  bool Modified = false;
+  Modified |=
+      createInitOrFiniKernel(M, M.getGlobalVariable("llvm.global_ctors"),
+                             /*IsCtor =*/true);
+  Modified |=
+      createInitOrFiniKernel(M, M.getGlobalVariable("llvm.global_dtors"),
+                             /*IsCtor =*/false);
+  return Modified;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -665,6 +665,10 @@
         Func.getFnAttribute("runtime-handle").getValueAsString().str(),
         /*Copy=*/true);
   }
+  if (Func.hasFnAttribute("device-init"))
+    Kern[".kind"] = Kern.getDocument()->getNode("init");
+  else if (Func.hasFnAttribute("device-fini"))
+    Kern[".kind"] = Kern.getDocument()->getNode("fini");
 }
 
 void MetadataStreamerV3::emitKernelArgs(const Function &Func,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -349,6 +349,7 @@
   initializeSIOptimizeVGPRLiveRangePass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
   initializeAMDGPUFixFunctionBitcastsPass(*PR);
+  initializeAMDGPUCtorDtorLoweringPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
   initializeAMDGPUAttributorPass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
@@ -1014,6 +1015,7 @@
   disablePass(&PatchableFunctionID);
 
   addPass(createAMDGPUPrintfRuntimeBinding());
+  addPass(createAMDGPUCtorDtorLoweringPass());
 
   // This must occur before inlining, as the inliner will not look through
   // bitcast calls.
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -53,6 +53,7 @@
   AMDGPUCodeGenPrepare.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFixFunctionBitcasts.cpp
+  AMDGPUCtorDtorLowering.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInstCombineIntrinsic.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+@llvm.global_ctors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @foo.5, i8* null }]
+
+define internal void @foo() {
+  ret void
+
+}
+
+define internal void @foo.5() {
+  ret void
+
+}
+
+; CHECK: ---
+; CHECK: .kind: init
+; CHECK: .name: amdgcn.device.init
+
+@llvm.global_dtors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @bar.5, i8* null }]
+
+define internal void @bar() {
+  ret void
+
+}
+
+define internal void @bar.5() {
+  ret void
+
+}
+
+; CHECK: .kind: fini
+; CHECK: .name: amdgcn.device.fini
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -31,6 +31,7 @@
 ; GCN-O0-NEXT:    AMDGPU Printf lowering
 ; GCN-O0-NEXT:      FunctionPass Manager
 ; GCN-O0-NEXT:        Dominator Tree Construction
+; GCN-O0-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O0-NEXT:    Fix function bitcasts for AMDGPU
 ; GCN-O0-NEXT:    FunctionPass Manager
 ; GCN-O0-NEXT:      Early propagate attributes from kernels to functions
@@ -165,6 +166,7 @@
 ; GCN-O1-NEXT:    AMDGPU Printf lowering
 ; GCN-O1-NEXT:      FunctionPass Manager
 ; GCN-O1-NEXT:        Dominator Tree Construction
+; GCN-O1-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O1-NEXT:    Fix function bitcasts for AMDGPU
 ; GCN-O1-NEXT:    FunctionPass Manager
 ; GCN-O1-NEXT:      Early propagate attributes from kernels to functions
@@ -415,6 +417,7 @@
 ; GCN-O1-OPTS-NEXT:    AMDGPU Printf lowering
 ; GCN-O1-OPTS-NEXT:      FunctionPass Manager
 ; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
+; GCN-O1-OPTS-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O1-OPTS-NEXT:    Fix function bitcasts for AMDGPU
 ; GCN-O1-OPTS-NEXT:    FunctionPass Manager
 ; GCN-O1-OPTS-NEXT:      Early propagate attributes from kernels to functions
@@ -698,6 +701,7 @@
 ; GCN-O2-NEXT:    AMDGPU Printf lowering
 ; GCN-O2-NEXT:      FunctionPass Manager
 ; GCN-O2-NEXT:        Dominator Tree Construction
+; GCN-O2-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O2-NEXT:    Fix function bitcasts for AMDGPU
 ; GCN-O2-NEXT:    FunctionPass Manager
 ; GCN-O2-NEXT:      Early propagate attributes from kernels to functions
@@ -983,6 +987,7 @@
 ; GCN-O3-NEXT:    AMDGPU Printf lowering
 ; GCN-O3-NEXT:      FunctionPass Manager
 ; GCN-O3-NEXT:        Dominator Tree Construction
+; GCN-O3-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O3-NEXT:    Fix function bitcasts for AMDGPU
 ; GCN-O3-NEXT:    FunctionPass Manager
 ; GCN-O3-NEXT:      Early propagate attributes from kernels to functions
diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
@@ -0,0 +1,21 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-ctor-dtor < %s | FileCheck %s
+
+@llvm.global_ctors = appending addrspace(1) global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }]
+@llvm.global_dtors = appending addrspace(1) global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }]
+
+; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.init() #0
+; CHECK-NEXT: call void @foo
+
+; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.fini() #1
+; CHECK-NEXT: call void @bar
+
+define internal void @foo() {
+  ret void
+}
+
+define internal void @bar() {
+  ret void
+}
+
+; CHECK: attributes #0 = { "device-init" }
+; CHECK: attributes #1 = { "device-fini" }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-ctor-dtor < %s | FileCheck %s
+
+@llvm.global_ctors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @foo.5, i8* null }]
+@llvm.global_dtors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @bar.5, i8* null }]
+
+; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.init() #0
+; CHECK-NEXT: call void @foo
+; CHECK-NEXT: call void @foo.5
+
+; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.fini() #1
+; CHECK-NEXT: call void @bar
+; CHECK-NEXT: call void @bar.5
+
+define internal void @foo() {
+  ret void
+}
+
+define internal void @bar() {
+  ret void
+}
+
+define internal void @foo.5() {
+  ret void
+}
+
+define internal void @bar.5() {
+  ret void
+}
+
+; CHECK: attributes #0 = { "device-init" }
+; CHECK: attributes #1 = { "device-fini" }
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -133,6 +133,7 @@
     "AMDGPUCodeGenPrepare.cpp",
     "AMDGPUExportClustering.cpp",
     "AMDGPUFixFunctionBitcasts.cpp",
+    "AMDGPUCtorDtorLowering.cpp",
     "AMDGPUFrameLowering.cpp",
     "AMDGPUGlobalISelUtils.cpp",
     "AMDGPUHSAMetadataStreamer.cpp",