Index: lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -46,8 +46,11 @@
 class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
 private:
   const TargetMachine *TM = nullptr;
+  SmallVector<CallGraphNode*, 8> NodeList;
 
   bool addFeatureAttributes(Function &F);
+  bool processUniformWorkGroupAttribute();
+  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
 
 public:
   static char ID;
@@ -186,7 +189,6 @@
     Parent.addFnAttr(Name);
     return true;
   }
-
   return false;
 }
 
@@ -213,6 +215,55 @@
   handleAttr(Parent, Callee, AttrName);
 }
 
+bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
+  bool Changed = false;
+
+  // Visit nodes from least-referenced to most-referenced (NodeList was built
+  // most-to-least), so kernels are processed before the functions they call.
+  for (auto *Node : reverse(NodeList)) {
+    Function *Caller = Node->getFunction();
+
+    for (auto I : *Node) {
+      Function *Callee = std::get<1>(I)->getFunction();
+      if (Callee)
+        Changed |= propagateUniformWorkGroupAttribute(*Caller, *Callee);
+    }
+  }
+
+  return Changed;
+}
+
+bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
+    Function &Caller, Function &Callee) {
+
+  // Check for externally defined function
+  if (!Callee.hasExactDefinition()) {
+    Callee.addFnAttr("uniform-work-group-size", "false");
+    if (!Caller.hasFnAttribute("uniform-work-group-size"))
+      Caller.addFnAttr("uniform-work-group-size", "false");
+
+    return true;
+  }
+  // Check if the Caller has the attribute
+  if (Caller.hasFnAttribute("uniform-work-group-size")) {
+    // Check if the value of the attribute is true
+    if (Caller.getFnAttribute("uniform-work-group-size")
+        .getValueAsString().equals("true")) {
+      // Propagate the attribute to the Callee, if it does not have it
+      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
+        Callee.addFnAttr("uniform-work-group-size", "true");
+        return true;
+      }
+    } else {
+      Callee.addFnAttr("uniform-work-group-size", "false");
+      return true;
+    }
+  } else {
+    // If the attribute is absent, set it as false
+    Caller.addFnAttr("uniform-work-group-size", "false");
+    Callee.addFnAttr("uniform-work-group-size", "false");
+    return true;
+  }
+  return false;
+}
+
 bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
   const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
   bool HasFlat = ST.hasFlatAddressSpace();
@@ -293,15 +344,19 @@
 }
 
 bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
-  Module &M = SCC.getCallGraph().getModule();
-  Triple TT(M.getTargetTriple());
-
   bool Changed = false;
+
   for (CallGraphNode *I : SCC) {
-    Function *F = I->getFunction();
+    // Build a list of CallGraphNodes from most number of uses to least
+    if (I->getNumReferences())
+      NodeList.push_back(I);
+    else
+      processUniformWorkGroupAttribute();
+
+    Function *F = I->getFunction();
+    // Add feature attributes
     if (!F || F->isDeclaration())
       continue;
-
     Changed |= addFeatureAttributes(*F);
   }
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -675,6 +675,9 @@
 }
 
 void AMDGPUPassConfig::addCodeGenPrepare() {
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn)
+    addPass(createAMDGPUAnnotateKernelFeaturesPass());
+
   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
       EnableLowerKernelArguments)
     addPass(createAMDGPULowerKernelArgumentsPass());
@@ -762,7 +765,6 @@
 
   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
-  addPass(createAMDGPUAnnotateKernelFeaturesPass());
 
   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
   // regions formed by them.
Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll =================================================================== --- test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -244,52 +244,52 @@ ret void } -; HSA: define void @use_implicitarg_ptr() #15 { +; HSA: define void @use_implicitarg_ptr() #16 { define void @use_implicitarg_ptr() #1 { %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef ret void } -; HSA: define void @func_indirect_use_implicitarg_ptr() #15 { +; HSA: define void @func_indirect_use_implicitarg_ptr() #16 { define void @func_indirect_use_implicitarg_ptr() #1 { call void @use_implicitarg_ptr() ret void } -; HSA: declare void @external.func() #16 +; HSA: declare void @external.func() #17 declare void @external.func() #3 -; HSA: define internal void @defined.func() #16 { +; HSA: define internal void @defined.func() #17 { define internal void @defined.func() #3 { ret void } -; HSA: define void @func_call_external() #16 { +; HSA: define void @func_call_external() #17 { define void @func_call_external() #3 { call void @external.func() ret void } -; HSA: define void @func_call_defined() #16 { +; HSA: define void @func_call_defined() #17 { define void @func_call_defined() #3 { call void @defined.func() ret void } -; HSA: define void @func_call_asm() #16 { +; HSA: define void @func_call_asm() #17 { define void @func_call_asm() #3 { call void asm sideeffect "", ""() #3 ret void } -; HSA: define amdgpu_kernel void @kern_call_external() #17 { +; HSA: define amdgpu_kernel void @kern_call_external() #18 { define amdgpu_kernel void @kern_call_external() #3 { call void @external.func() ret void } -; HSA: define amdgpu_kernel void @func_kern_defined() #17 { +; HSA: define amdgpu_kernel void @func_kern_defined() #18 { define amdgpu_kernel void @func_kern_defined() #3 { call void 
@defined.func() ret void @@ -301,20 +301,20 @@ attributes #3 = { nounwind } ; HSA: attributes #0 = { nounwind readnone speculatable } -; HSA: attributes #1 = { nounwind "amdgpu-work-item-id-x" "target-cpu"="fiji" } -; HSA: attributes #2 = { nounwind "amdgpu-work-item-id-y" "target-cpu"="fiji" } -; HSA: attributes #3 = { nounwind "amdgpu-work-item-id-z" "target-cpu"="fiji" } -; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-x" "target-cpu"="fiji" } -; HSA: attributes #5 = { nounwind "amdgpu-work-group-id-y" "target-cpu"="fiji" } -; HSA: attributes #6 = { nounwind "amdgpu-work-group-id-z" "target-cpu"="fiji" } -; HSA: attributes #7 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" } -; HSA: attributes #8 = { nounwind "amdgpu-queue-ptr" "target-cpu"="fiji" } -; HSA: attributes #9 = { nounwind "amdgpu-dispatch-id" "target-cpu"="fiji" } +; HSA: attributes #1 = { nounwind "amdgpu-work-item-id-x" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #2 = { nounwind "amdgpu-work-item-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #3 = { nounwind "amdgpu-work-item-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-x" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #5 = { nounwind "amdgpu-work-group-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #6 = { nounwind "amdgpu-work-group-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #7 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #8 = { nounwind "amdgpu-queue-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #9 = { nounwind "amdgpu-dispatch-id" "target-cpu"="fiji" "uniform-work-group-size"="false" } ; HSA: attributes #10 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "target-cpu"="fiji" } -; HSA: 
attributes #11 = { nounwind "target-cpu"="fiji" } -; HSA: attributes #12 = { nounwind "target-cpu"="gfx900" } -; HSA: attributes #13 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" } -; HSA: attributes #14 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" } +; HSA: attributes #11 = { nounwind "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #12 = { nounwind "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; HSA: attributes #13 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; HSA: attributes #14 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } ; HSA: attributes #15 = { nounwind "amdgpu-implicitarg-ptr" "target-cpu"="fiji" } -; HSA: attributes #16 = { nounwind } -; HSA: attributes #17 = { nounwind "amdgpu-flat-scratch" } +; HSA: attributes #16 = { nounwind "amdgpu-implicitarg-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #17 = { nounwind "uniform-work-group-size"="false" } Index: test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll @@ -0,0 +1,18 @@ +; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s + +; If the kernel does not have the uniform-work-group-attribute, set both callee and caller as false + +; CHECK: define void @foo() #[[FOO:[0-9]+]] { +define void @foo() #0 { + ret void +} + +; CHECK: define amdgpu_kernel void @kernel1() #[[FOO]] { +define amdgpu_kernel void @kernel1() #1 { + call void @foo() + ret void +} + +attributes #0 = { "uniform-work-group-size"="true" } + +; CHECK: attributes #[[FOO]] = { "uniform-work-group-size"="false" } Index: test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll =================================================================== --- /dev/null +++ 
test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll @@ -0,0 +1,24 @@ +; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s + +; Test to verify if the attribute gets propagated across nested function calls + +; CHECK: define void @func1() #[[FUNC:[0-9]+]] { +define void @func1() #0 { + ret void +} + +; CHECK: define void @func2() #[[FUNC]] { +define void @func2() #1 { + call void @func1() + ret void +} + +; CHECK: define amdgpu_kernel void @kernel3() #[[FUNC:[0-9]+]] { +define amdgpu_kernel void @kernel3() #2 { + call void @func2() + ret void +} + +attributes #2 = { "uniform-work-group-size"="true" } + +; CHECK: attributes #[[FUNC]] = { "uniform-work-group-size"="true" } Index: test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll @@ -0,0 +1,25 @@ +; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s + +; Two kernels with different values of the uniform-work-group-attribute call the same function + +; CHECK: define void @func() #[[FUNC:[0-9]+]] { +define void @func() #0 { + ret void +} + +; CHECK: define amdgpu_kernel void @kernel1() #[[KERNEL1:[0-9]+]] { +define amdgpu_kernel void @kernel1() #1 { + call void @func() + ret void +} + +; CHECK: define amdgpu_kernel void @kernel2() #[[FUNC]] { +define amdgpu_kernel void @kernel2() #2 { + call void @func() + ret void +} + +attributes #1 = { "uniform-work-group-size"="true" } + +; CHECK: attributes #[[FUNC]] = { "uniform-work-group-size"="false" } +; CHECK: attributes #[[KERNEL1]] = { "uniform-work-group-size"="true" } Index: test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll @@ -0,0 +1,33 @@ +; RUN: opt -S 
-mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s + +; Propagate the uniform-work-group-attribute from the kernel to callee if it doesn't have it +; CHECK: define void @func() #[[FUNC:[0-9]+]] { +define void @func() #0 { + ret void +} + +; CHECK: define amdgpu_kernel void @kernel1() #[[KERNEL1:[0-9]+]] { +define amdgpu_kernel void @kernel1() #1 { + call void @func() + ret void +} + +; External declaration of a function +; CHECK: define weak_odr void @weak_func() #[[FUNC]] { +define weak_odr void @weak_func() #0 { + ret void +} + +; CHECK: define amdgpu_kernel void @kernel2() #[[KERNEL2:[0-9]+]] { +define amdgpu_kernel void @kernel2() #2 { + call void @weak_func() + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { "uniform-work-group-size"="false" } +attributes #2 = { "uniform-work-group-size"="true" } + +; CHECK: attributes #[[FUNC]] = { nounwind "uniform-work-group-size"="false" } +; CHECK: attributes #[[KERNEL1]] = { "uniform-work-group-size"="false" } +; CHECK: attributes #[[KERNEL2]] = { "uniform-work-group-size"="true" } Index: test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll @@ -0,0 +1,37 @@ +; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s + +; Test to ensure recursive functions exhibit proper behaviour +; Test to generate fibonacci numbers + +; CHECK: define i32 @fib(i32 %n) #[[FIB:[0-9]+]] { +define i32 @fib(i32 %n) #0 { + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %exit, label %cont1 + +cont1: + %cmp2 = icmp eq i32 %n, 1 + br i1 %cmp2, label %exit, label %cont2 + +cont2: + %nm1 = sub i32 %n, 1 + %fibm1 = call i32 @fib(i32 %nm1) + %nm2 = sub i32 %n, 2 + %fibm2 = call i32 @fib(i32 %nm2) + %retval = add i32 %fibm1, %fibm2 + + ret i32 %retval + +exit: + ret i32 1 +} + +; CHECK: define amdgpu_kernel void @kernel(i32 addrspace(1)* %m) 
#[[FIB]] { +define amdgpu_kernel void @kernel(i32 addrspace(1)* %m) #1 { + %r = call i32 @fib(i32 5) + store i32 %r, i32 addrspace(1)* %m + ret void +} + +attributes #1 = { "uniform-work-group-size"="true" } + +; CHECK: attributes #[[FIB]] = { "uniform-work-group-size"="true" } Index: test/CodeGen/AMDGPU/uniform-work-group-test.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/uniform-work-group-test.ll @@ -0,0 +1,35 @@ +; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s + +; CHECK: define void @func1() #[[FUNC:[0-9]+]] { +define void @func1() #0 { + ret void +} + +; CHECK: define void @func4() #[[FUNC]] { +define void @func4() #1 { + ret void +} + +; CHECK: define void @func2() #[[FUNC]] { +define void @func2() #1 { + call void @func4() + call void @func1() + ret void +} + +; CHECK: define void @func3() #[[FUNC]] { +define void @func3() #1 { + call void @func1() + ret void +} + +; CHECK: define amdgpu_kernel void @kernel3() #[[FUNC]] { +define amdgpu_kernel void @kernel3() #2 { + call void @func2() + call void @func3() + ret void +} + +attributes #2 = { "uniform-work-group-size"="true" } + +; CHECK: attributes #[[FUNC]] = { "uniform-work-group-size"="true" }