diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp @@ -30,11 +30,13 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/InstrTypes.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/Cloning.h" +#include #define DEBUG_TYPE "amdgpu-propagate-attributes" @@ -113,12 +115,17 @@ // Clone functions as needed or just set attributes. bool AllowClone; + CallGraph *ModuleCG = nullptr; + // Option propagation roots. SmallSet Roots; // Clones of functions with their attributes. SmallVector Clones; + // To memoize address taken functions. + SmallSet AddressTakenFunctions; + // Find a clone with required features. Function *findFunction(const FnProperties &PropsNeeded, Function *OrigF); @@ -139,14 +146,23 @@ bool process(); public: - AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) : - TM(TM), AllowClone(AllowClone) {} + AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) + : TM(TM), AllowClone(AllowClone) {} // Use F as a root and propagate its attributes. bool process(Function &F); // Propagate attributes starting from kernel functions. - bool process(Module &M); + bool process(Module &M, CallGraph *CG); + + // Remove attributes from F. + // This is used in presence of address taken functions. + bool removeAttributes(Function *F); + + // Handle call graph rooted at address taken functions. + // This function will erase all attributes present + // on all functions called from address taken functions transitively. + bool handleAddressTakenFunctions(CallGraph *CG); }; // Allows to propagate attributes early, but no clonning is allowed as it must @@ -182,6 +198,7 @@ *PassRegistry::getPassRegistry()); } + void getAnalysisUsage(AnalysisUsage &AU) const override; bool runOnModule(Module &M) override; }; @@ -199,6 +216,49 @@ "Late propagate attributes from kernels to functions", false, false) +bool AMDGPUPropagateAttributes::removeAttributes(Function *F) { + bool Changed = false; + if (!F) + return Changed; + LLVM_DEBUG(dbgs() << "Removing attributes from " << F->getName() << '\n'); + for (unsigned I = 0; I < NumAttr; ++I) { + if (F->hasFnAttribute(AttributeNames[I])) { + F->removeFnAttr(AttributeNames[I]); + Changed = true; + } + } + return Changed; +} + +bool AMDGPUPropagateAttributes::handleAddressTakenFunctions(CallGraph *CG) { + assert(ModuleCG && "Call graph not present"); + + bool Changed = false; + SmallSet Visited; + + for (Function *F : AddressTakenFunctions) { + CallGraphNode *CGN = (*CG)[F]; + if (!Visited.count(CGN)) { + Changed |= removeAttributes(F); + Visited.insert(CGN); + } + + std::queue SubGraph; + SubGraph.push(CGN); + while (!SubGraph.empty()) { + CallGraphNode *CGN = SubGraph.front(); + SubGraph.pop(); + if (!Visited.count(CGN)) { + Changed |= removeAttributes(CGN->getFunction()); + Visited.insert(CGN); + } + for (auto N : *CGN) + SubGraph.push(N.second); + } + } + return Changed; +} + Function * AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded, Function *OrigF) { @@ -210,11 +270,12 @@ return nullptr; } -bool AMDGPUPropagateAttributes::process(Module &M) { +bool AMDGPUPropagateAttributes::process(Module &M, CallGraph *CG) { for (auto &F : M.functions()) if (AMDGPU::isEntryFunctionCC(F.getCallingConv())) Roots.insert(&F); + ModuleCG = CG; return process(); } @@ -240,6 +301,9 @@ if (F.isDeclaration()) continue; + if (F.hasAddressTaken(nullptr, true, true, true)) + AddressTakenFunctions.insert(&F); + const FnProperties CalleeProps(*TM, F); SmallVector, 32> ToReplace; SmallSet Visited; @@ -310,6 +374,12 @@ Roots.clear(); Clones.clear(); + // Keep the post processing related to indirect + // calls separate to handle them gracefully. + // The core traversal need not be affected by this. + if (AllowClone) + Changed |= handleAddressTakenFunctions(ModuleCG); + return Changed; } @@ -389,6 +459,10 @@ return AMDGPUPropagateAttributes(TM, false).process(F); } +void AMDGPUPropagateAttributesLate::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); +} + bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) { if (!TM) { auto *TPC = getAnalysisIfAvailable(); @@ -397,8 +471,8 @@ TM = &TPC->getTM(); } - - return AMDGPUPropagateAttributes(TM, true).process(M); + CallGraph &CG = getAnalysis().getCallGraph(); + return AMDGPUPropagateAttributes(TM, true).process(M, &CG); } FunctionPass @@ -423,8 +497,9 @@ } PreservedAnalyses -AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &AM) { - return AMDGPUPropagateAttributes(&TM, true).process(M) - ? PreservedAnalyses::none() - : PreservedAnalyses::all(); +AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &MAM) { + AMDGPUPropagateAttributes APA(&TM, true); + CallGraph &CG = MAM.getResult(M); + const bool Changed = APA.process(M, &CG); + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -65,6 +65,7 @@ ; GCN-O1-NEXT: AMDGPU Printf lowering ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: CallGraph Construction ; GCN-O1-NEXT: Late propagate attributes from kernels to functions ; GCN-O1-NEXT: Interprocedural Sparse Conditional Constant Propagation ; GCN-O1-NEXT: FunctionPass Manager @@ -374,6 +375,7 @@ ; GCN-O2-NEXT: AMDGPU Printf lowering ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: CallGraph Construction ; GCN-O2-NEXT: Late propagate attributes from kernels to functions ; GCN-O2-NEXT: Interprocedural Sparse Conditional Constant Propagation ; GCN-O2-NEXT: FunctionPass Manager @@ -728,6 +730,7 @@ ; GCN-O3-NEXT: AMDGPU Printf lowering ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: CallGraph Construction ; GCN-O3-NEXT: Late propagate attributes from kernels to functions ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll @@ -0,0 +1,55 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s + +; CHECK-LABEL: define float @common() { +define float @common() { + ret float 0.0 +} + +; CHECK-LABEL: define float @foo() { +define float @foo() { + %direct_call = call contract float @common() + ret float %direct_call +} + +; CHECK-LABEL: define float @bar() { +define float @bar() { + ret float 0.0 +} + +; CHECK-LABEL: define float @baz() { +define float @baz() { + ret float 0.0 +} + +define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 { + %fn = alloca float ()* + switch i32 %type, label %sw.default [ + i32 1, label %sw.bb + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb: + store float ()* @foo, float ()** %fn + br label %sw.epilog + +sw.bb2: + store float ()* @bar, float ()** %fn + br label %sw.epilog + +sw.bb3: + store float ()* @baz, float ()** %fn + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + %fp = load float ()*, float ()** %fn + %direct_call = call contract float @common() + %indirect_call = call contract float %fp() + store float %indirect_call, float* %result + ret void +} + +attributes #1 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll @@ -0,0 +1,53 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s + +; Test to check if we skip propgating attributes even if +; a function is called directly as well as +; indirectly. "baz" is called directly as well indirectly. + +; CHECK-LABEL: define float @foo() +define float @foo() #1 { + ret float 0.0 +} + +; CHECK-LABEL: define float @bar() +define float @bar() #1 { + ret float 0.0 +} + +; CHECK-LABEL: define float @baz() +define float @baz() #1 { + ret float 0.0 +} + +define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 { + %fn = alloca float ()* + switch i32 %type, label %sw.default [ + i32 1, label %sw.bb + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb: + store float ()* @foo, float ()** %fn + br label %sw.epilog + +sw.bb2: + store float ()* @bar, float ()** %fn + br label %sw.epilog + +sw.bb3: + store float ()* @baz, float ()** %fn + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + %fp = load float ()*, float ()** %fn + %direct_call = call contract float @baz() + %indirect_call = call contract float %fp() + store float %indirect_call, float* %result + ret void +} + +attributes #1 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll --- a/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll @@ -30,11 +30,14 @@ ; In order to expose this bug, it is necessary that `g` have one of the ; propagated attributes, so that a clone and substitution would take place if g ; were actually the function being called. -; CHECK-DAG: define private void @g.1() #1 -; CHECK-DAG: define internal void @g() #2 +; CHECK-DAG: define private void @g.1() #0 +; CHECK-DAG: define internal void @g() #1 define private void @g() #1 { ret void } attributes #0 = { noinline } attributes #1 = { noinline "amdgpu-waves-per-eu"="1,10" } + +; CHECK: attributes #0 = { noinline } +; CHECK-NEXT: attributes #1 = { noinline "target-features"="+enable-ds128,+enable-prt-strict-null,+flat-address-space,+flat-for-global,+load-store-opt,+promote-alloca,+trap-handler,+unaligned-access-mode,-wavefrontsize16,-wavefrontsize32,+wavefrontsize64" } diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll @@ -0,0 +1,34 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s + +; Test to check if we skip attributes on address +; taken functions and its callees. + +; CHECK-LABEL: define float @bar() { +define float @bar() { + ret float 0.0 +} + +; CHECK-LABEL: define float @baz() { +define float @baz() { + ret float 0.0 +} + +; CHECK-LABEL: define float @foo() { +define float @foo() { + %v1 = call contract float @bar() + %v2 = call contract float @baz() + %v3 = fadd float %v1, %v2 + ret float %v3 +} + +; CHECK-LABEL: define amdgpu_kernel void @kernel(float* %result, i32 %type) #0 { +define amdgpu_kernel void @kernel(float *%result, i32 %type) #1 { + %fn = alloca float ()* + store float ()* @foo, float ()** %fn + %fp = load float ()*, float ()** %fn + %indirect_call = call contract float %fp() + store float %indirect_call, float* %result + ret void +} + +attributes #1 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll @@ -0,0 +1,32 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s + +; Test attributes on a function which +; is called indirectly from two kernels +; having different launch bounds. + +; This function should not have any attributes on it. +; CHECK-LABEL: define float @foo() { +define float @foo() { + ret float 0.0 +} + +define amdgpu_kernel void @kernel1(float *%result, i32 %type) #1 { + %fn = alloca float ()* + store float ()* @foo, float ()** %fn + %fp = load float ()*, float ()** %fn + %indirect_call = call contract float %fp() + store float %indirect_call, float* %result + ret void +} + +define amdgpu_kernel void @kernel2(float *%result, i32 %type) #2 { + %fn = alloca float ()* + store float ()* @foo, float ()** %fn + %fp = load float ()*, float ()** %fn + %indirect_call = call contract float %fp() + store float %indirect_call, float* %result + ret void +} + +attributes #1 = { "amdgpu-flat-work-group-size"="1,256" } +attributes #2 = { "amdgpu-flat-work-group-size"="1,512" }