Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -57,7 +57,7 @@
 
 void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
 
-ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
+Pass *createAMDGPUAnnotateKernelFeaturesPass();
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;
 
Index: lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -15,8 +15,10 @@
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
@@ -26,26 +28,30 @@
 
 namespace {
 
-class AMDGPUAnnotateKernelFeatures : public ModulePass {
+class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
 private:
+  const TargetMachine *TM = nullptr;
   AMDGPUAS AS;
 
-  static bool hasAddrSpaceCast(const Function &F, AMDGPUAS AS);
-  void addAttrToCallers(Function *Intrin, StringRef AttrName);
+  bool addFeatureAttributes(Function &F);
+
+  void addAttrToCallers(Function &Intrin, StringRef AttrName);
   bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
 
 public:
   static char ID;
 
-  AMDGPUAnnotateKernelFeatures() : ModulePass(ID) {}
-  bool runOnModule(Module &M) override;
+  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
+
+  bool doInitialization(CallGraph &CG) override;
+  bool runOnSCC(CallGraphSCC &SCC) override;
 
   StringRef getPassName() const override {
     return "AMDGPU Annotate Kernel Features";
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
-    ModulePass::getAnalysisUsage(AU);
+    CallGraphSCCPass::getAnalysisUsage(AU);
   }
 
   static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS);
@@ -121,16 +127,108 @@
   return false;
 }
 
-// Return true if an addrspacecast is used that requires the queue ptr.
-bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F,
-                                                    AMDGPUAS AS) {
+// We do not need to note the x workitem or workgroup id because they are always
+// initialized.
+//
+// TODO: We should not add the attributes if the known compile time workgroup
+// size is 1 for y/z.
+static StringRef intrinsicToAttrName(Intrinsic::ID ID, bool &IsQueuePtr) {
+  switch (ID) {
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::r600_read_tidig_y:
+    return "amdgpu-work-item-id-y";
+  case Intrinsic::amdgcn_workitem_id_z:
+  case Intrinsic::r600_read_tidig_z:
+    return "amdgpu-work-item-id-z";
+  case Intrinsic::amdgcn_workgroup_id_y:
+  case Intrinsic::r600_read_tgid_y:
+    return "amdgpu-work-group-id-y";
+  case Intrinsic::amdgcn_workgroup_id_z:
+  case Intrinsic::r600_read_tgid_z:
+    return "amdgpu-work-group-id-z";
+  case Intrinsic::amdgcn_dispatch_ptr:
+    return "amdgpu-dispatch-ptr";
+  case Intrinsic::amdgcn_dispatch_id:
+    return "amdgpu-dispatch-id";
+  case Intrinsic::amdgcn_queue_ptr:
+  case Intrinsic::trap:
+  case Intrinsic::debugtrap:
+    IsQueuePtr = true;
+    return "amdgpu-queue-ptr";
+  default:
+    return "";
+  }
+}
+
+static bool handleAttr(Function &Parent, const Function &Callee,
+                       StringRef Name) {
+  if (Callee.hasFnAttribute(Name)) {
+    Parent.addFnAttr(Name);
+    return true;
+  }
+
+  return false;
+}
+
+static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
+                                   bool &NeedQueuePtr) {
+
+  static const StringRef AttrNames[] = {
+    // .x omitted
+    { "amdgpu-work-item-id-y" },
+    { "amdgpu-work-item-id-z" },
+    // .x omitted
+    { "amdgpu-work-group-id-y" },
+    { "amdgpu-work-group-id-z" },
+    { "amdgpu-dispatch-ptr" },
+    { "amdgpu-dispatch-id" }
+  };
+
+  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
+    NeedQueuePtr = true;
+
+  for (StringRef AttrName : AttrNames)
+    handleAttr(Parent, Callee, AttrName);
+}
+
+bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
+  bool HasApertureRegs = TM->getSubtarget<SISubtarget>(F).hasApertureRegs();
   SmallPtrSet<const Constant *, 8> ConstantExprVisited;
 
-  for (const BasicBlock &BB : F) {
-    for (const Instruction &I : BB) {
+  bool Changed = false;
+  bool NeedQueuePtr = false;
+
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      CallSite CS(&I);
+      if (CS) {
+        Function *Callee = CS.getCalledFunction();
+
+        // TODO: Do something with indirect calls.
+        if (!Callee)
+          continue;
+
+        Intrinsic::ID IID = Callee->getIntrinsicID();
+        if (IID == Intrinsic::not_intrinsic) {
+          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
+          Changed = true;
+        } else {
+          StringRef AttrName = intrinsicToAttrName(IID, NeedQueuePtr);
+          if (!AttrName.empty()) {
+            F.addFnAttr(AttrName);
+            Changed = true;
+          }
+        }
+      }
+
+      if (NeedQueuePtr || HasApertureRegs)
+        continue;
+
       if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
-        if (castRequiresQueuePtr(ASC, AS))
-          return true;
+        if (castRequiresQueuePtr(ASC, AS)) {
+          NeedQueuePtr = true;
+          continue;
+        }
       }
 
       for (const Use &U : I.operands()) {
@@ -138,20 +236,27 @@
         if (!OpC)
           continue;
 
-        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS))
-          return true;
+        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) {
+          NeedQueuePtr = true;
+          break;
+        }
       }
     }
   }
 
-  return false;
+  if (NeedQueuePtr) {
+    F.addFnAttr("amdgpu-queue-ptr");
+    Changed = true;
+  }
+
+  return Changed;
 }
 
-void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
+void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function &Intrin,
                                                     StringRef AttrName) {
   SmallPtrSet<Function *, 4> SeenFuncs;
-  for (User *U : Intrin->users()) {
+  for (User *U : Intrin.users()) {
     // CallInst is the only valid user for an intrinsic.
     CallInst *CI = cast<CallInst>(U);
@@ -168,7 +273,7 @@
 
   for (const StringRef *Arr : IntrinsicToAttr) {
     if (Function *Fn = M.getFunction(Arr[0])) {
-      addAttrToCallers(Fn, Arr[1]);
+      addAttrToCallers(*Fn, Arr[1]);
       Changed = true;
     }
   }
@@ -176,62 +281,33 @@
   return Changed;
 }
 
-bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
+bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
+  Module &M = SCC.getCallGraph().getModule();
   Triple TT(M.getTargetTriple());
-  AS = AMDGPU::getAMDGPUAS(M);
-
-  static const StringRef IntrinsicToAttr[][2] = {
-    // .x omitted
-    { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" },
-    { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" },
-
-    { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" },
-    { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" },
-
-    { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
-    { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
-
-    // .x omitted
-    { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" },
-    { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" }
-  };
-
-  static const StringRef HSAIntrinsicToAttr[][2] = {
-    { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" },
-    { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" },
-    { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" },
-    { "llvm.trap", "amdgpu-queue-ptr" },
-    { "llvm.debugtrap", "amdgpu-queue-ptr" }
-  };
-
-  // TODO: We should not add the attributes if the known compile time workgroup
-  // size is 1 for y/z.
-  // TODO: Intrinsics that require queue ptr.
+  bool Changed = false;
+  for (CallGraphNode *I : SCC) {
+    Function *F = I->getFunction();
+    if (!F || F->isDeclaration())
+      continue;
 
-  // We do not need to note the x workitem or workgroup id because they are
-  // always initialized.
+    Changed |= addFeatureAttributes(*F);
+  }
 
-  bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
-  if (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::Mesa3D) {
-    Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
-    for (Function &F : M) {
-      if (F.hasFnAttribute("amdgpu-queue-ptr"))
-        continue;
+  return Changed;
+}
 
-      auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
-      bool HasApertureRegs = TPC && TPC->getTM<TargetMachine>()
-                                        .getSubtarget<SISubtarget>(F)
-                                        .hasApertureRegs();
-      if (!HasApertureRegs && hasAddrSpaceCast(F, AS))
-        F.addFnAttr("amdgpu-queue-ptr");
-    }
-  }
+bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    report_fatal_error("TargetMachine is required");
 
-  return Changed;
+  AS = AMDGPU::getAMDGPUAS(CG.getModule());
+  TM = &TPC->getTM<TargetMachine>();
+  return false;
 }
 
-ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
+Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
   return new AMDGPUAnnotateKernelFeatures();
 }
Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -0,0 +1,200 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=HSA %s
+
+declare i32 @llvm.amdgcn.workgroup.id.y() #0
+declare i32 @llvm.amdgcn.workgroup.id.z() #0
+
+declare i32 @llvm.amdgcn.workitem.id.y() #0
+declare i32 @llvm.amdgcn.workitem.id.z() #0
+
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+declare i64 @llvm.amdgcn.dispatch.id() #0
+
+; HSA: define void @use_workitem_id_y() #1 {
+define void @use_workitem_id_y() #1 {
+  %val = call i32 @llvm.amdgcn.workitem.id.y()
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+; HSA: define void @use_workitem_id_z() #2 {
+define void @use_workitem_id_z() #1 {
+  %val = call i32 @llvm.amdgcn.workitem.id.z()
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+; HSA: define void @use_workgroup_id_y() #3 {
+define void @use_workgroup_id_y() #1 {
+  %val = call i32 @llvm.amdgcn.workgroup.id.y()
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+; HSA: define void @use_workgroup_id_z() #4 {
+define void @use_workgroup_id_z() #1 {
+  %val = call i32 @llvm.amdgcn.workgroup.id.z()
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+; HSA: define void @use_dispatch_ptr() #5 {
+define void @use_dispatch_ptr() #1 {
+  %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  store volatile i8 addrspace(2)* %dispatch.ptr, i8 addrspace(2)* addrspace(1)* undef
+  ret void
+}
+
+; HSA: define void @use_queue_ptr() #6 {
+define void @use_queue_ptr() #1 {
+  %queue.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+  store volatile i8 addrspace(2)* %queue.ptr, i8 addrspace(2)* addrspace(1)* undef
+  ret void
+}
+
+; HSA: define void @use_dispatch_id() #7 {
+define void @use_dispatch_id() #1 {
+  %val = call i64 @llvm.amdgcn.dispatch.id()
+  store volatile i64 %val, i64 addrspace(1)* undef
+  ret void
+}
+
+; HSA: define void @use_workgroup_id_y_workgroup_id_z() #8 {
+define void @use_workgroup_id_y_workgroup_id_z() #1 {
+  %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
+  %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
+  store volatile i32 %val0, i32 addrspace(1)* undef
+  store volatile i32 %val1, i32 addrspace(1)* undef
+  ret void
+}
+
+; HSA: define void @func_indirect_use_workitem_id_y() #1 {
+define void @func_indirect_use_workitem_id_y() #1 {
+  call void @use_workitem_id_y()
+  ret void
+}
+
+; HSA: define void @func_indirect_use_workitem_id_z() #2 {
+define void @func_indirect_use_workitem_id_z() #1 {
+  call void @use_workitem_id_z()
+  ret void
+}
+
+; HSA: define void @func_indirect_use_workgroup_id_y() #3 {
+define void @func_indirect_use_workgroup_id_y() #1 {
+  call void @use_workgroup_id_y()
+  ret void
+}
+
+; HSA: define void @func_indirect_use_workgroup_id_z() #4 {
+define void @func_indirect_use_workgroup_id_z() #1 {
+  call void @use_workgroup_id_z()
+  ret void
+}
+
+; HSA: define void @func_indirect_indirect_use_workgroup_id_y() #3 {
+define void @func_indirect_indirect_use_workgroup_id_y() #1 {
+  call void @func_indirect_use_workgroup_id_y()
+  ret void
+}
+
+; HSA: define void @indirect_x2_use_workgroup_id_y() #3 {
+define void @indirect_x2_use_workgroup_id_y() #1 {
+  call void @func_indirect_indirect_use_workgroup_id_y()
+  ret void
+}
+
+; HSA: define void @func_indirect_use_dispatch_ptr() #5 {
+define void @func_indirect_use_dispatch_ptr() #1 {
+  call void @use_dispatch_ptr()
+  ret void
+}
+
+; HSA: define void @func_indirect_use_queue_ptr() #6 {
+define void @func_indirect_use_queue_ptr() #1 {
+  call void @use_queue_ptr()
+  ret void
+}
+
+; HSA: define void @func_indirect_use_dispatch_id() #7 {
+define void @func_indirect_use_dispatch_id() #1 {
+  call void @use_dispatch_id()
+  ret void
+}
+
+; HSA: define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #9 {
+define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #1 {
+  call void @func_indirect_use_workgroup_id_y_workgroup_id_z()
+  ret void
+}
+
+; HSA: define void @recursive_use_workitem_id_y() #1 {
+define void @recursive_use_workitem_id_y() #1 {
+  %val = call i32 @llvm.amdgcn.workitem.id.y()
+  store volatile i32 %val, i32 addrspace(1)* undef
+  call void @recursive_use_workitem_id_y()
+  ret void
+}
+
+; HSA: define void @call_recursive_use_workitem_id_y() #1 {
+define void @call_recursive_use_workitem_id_y() #1 {
+  call void @recursive_use_workitem_id_y()
+  ret void
+}
+
+; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #6 {
+define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
+  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
+  store volatile i32 0, i32 addrspace(4)* %stof
+  ret void
+}
+
+; HSA: define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #10 {
+define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #2 {
+  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
+  store volatile i32 0, i32 addrspace(4)* %stof
+  ret void
+}
+
+; HSA: define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #11 {
+define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #2 {
+  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
+  store volatile i32 0, i32 addrspace(4)* %stof
+  call void @func_indirect_use_queue_ptr()
+  ret void
+}
+
+; HSA: define void @indirect_use_group_to_flat_addrspacecast() #6 {
+define void @indirect_use_group_to_flat_addrspacecast() #1 {
+  call void @use_group_to_flat_addrspacecast(i32 addrspace(3)* null)
+  ret void
+}
+
+; HSA: define void @indirect_use_group_to_flat_addrspacecast_gfx9() #9 {
+define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 {
+  call void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* null)
+  ret void
+}
+
+; HSA: define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #6 {
+define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 {
+  call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* null)
+  ret void
+}
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind "target-cpu"="fiji" }
+attributes #2 = { nounwind "target-cpu"="gfx900" }
+
+; HSA: attributes #0 = { nounwind readnone speculatable }
+; HSA: attributes #1 = { nounwind "amdgpu-work-item-id-y" "target-cpu"="fiji" }
+; HSA: attributes #2 = { nounwind "amdgpu-work-item-id-z" "target-cpu"="fiji" }
+; HSA: attributes #3 = { nounwind "amdgpu-work-group-id-y" "target-cpu"="fiji" }
+; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-z" "target-cpu"="fiji" }
+; HSA: attributes #5 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" }
+; HSA: attributes #6 = { nounwind "amdgpu-queue-ptr" "target-cpu"="fiji" }
+; HSA: attributes #7 = { nounwind "amdgpu-dispatch-id" "target-cpu"="fiji" }
+; HSA: attributes #8 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "target-cpu"="fiji" }
+; HSA: attributes #9 = { nounwind "target-cpu"="fiji" }
+; HSA: attributes #10 = { nounwind "target-cpu"="gfx900" }
+; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" }