diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp @@ -236,6 +236,16 @@ if (F.isDeclaration()) continue; + // Skip propagating attributes and features to + // address taken functions. + if (F.hasAddressTaken()) { + if (!Roots.count(&F) && !NewRoots.count(&F)) { + NewRoots.insert(&F); + Changed = true; + } + continue; + } + const FnProperties CalleeProps(*TM, F); SmallVector, 32> ToReplace; SmallSet Visited; @@ -255,7 +265,11 @@ const FnProperties CallerProps(*TM, *Caller); - if (CalleeProps == CallerProps) { + // Convergence is allowed if the caller has its + // address taken because all callee's (attributes + features) + // may not agree as the callee may be the target of + // more than one function (called directly or indirectly). + if (Caller->hasAddressTaken() || CalleeProps == CallerProps) { if (!Roots.count(&F)) NewRoots.insert(&F); continue; diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll @@ -0,0 +1,79 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-early %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" + +; Complicated call graph where a function is called +; directly from a kernel and also from a function +; whose address is taken. 
+ +; CHECK-LABEL: define float @common_callee.gc(i32 %a) #0 { +define float @common_callee.gc(i32 %a) { + %add = add i32 %a, 6 + %mul = mul nsw i32 %add, 9 + %div = sdiv i32 %mul, 8 + %f = sitofp i32 %div to float + ret float %f +} + +; CHECK-LABEL: define float @foo(i32 %a) { +define float @foo(i32 %a) { +entry: + %mul = mul nsw i32 %a, 5 + %cast = sitofp i32 %mul to float + + ret float %cast +} + +; CHECK-LABEL: define float @bar(i32 %a) { +define float @bar(i32 %a) { +entry: + %div = sdiv i32 %a, 7 + %direct_call = call contract float @common_callee.gc(i32 5) + ret float %direct_call +} + +; CHECK-LABEL: define float @baz(i32 %a) { +define float @baz(i32 %a) { +entry: + %mul = mul nsw i32 %a, 6 + %div = sdiv i32 %mul, 7 + %conv = sitofp i32 %div to float + ret float %conv +} + +define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 { +entry: + %fn = alloca float (i32)*, align 8, addrspace(5) + switch i32 %type, label %sw.default [ + i32 1, label %sw.bb + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb: + store float (i32)* @foo, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb2: + store float (i32)* @bar, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb3: + store float (i32)* @baz, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + %fp = load float (i32)*, float (i32)* addrspace(5)* %fn, align 8 + %direct_call = call contract float @common_callee.gc(i32 4) + %conv = fptosi float %direct_call to i32 + %call4 = call contract float %fp(i32 %conv) + store float %call4, float* %result, align 4 + ret void +} + +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" 
"target-features"="+16-bit-insts,+add-no-carry-insts,+aperture-regs,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+ds-src2-insts,+enable-ds128,+enable-prt-strict-null,+fast-denormal-f32,+fast-fmaf,+flat-address-space,+flat-for-global,+flat-global-insts,+flat-inst-offsets,+flat-scratch-insts,+fma-mix-insts,+fp64,+gcn3-encoding,+gfx7-gfx8-gfx9-insts,+gfx8-insts,+gfx9,+gfx9-insts,+half-rate-64-ops,+image-gather4-d16-bug,+int-clamp-insts,+inv-2pi-inline-imm,+ldsbankcount32,+load-store-opt,+localmemorysize65536,+mad-mac-f32-insts,+no-xnack-support,+promote-alloca,+r128-a16,+s-memrealtime,+s-memtime-inst,+scalar-atomics,+scalar-flat-scratch-insts,+scalar-stores,+sdwa,+sdwa-omod,+sdwa-scalar,+sdwa-sdst,+sram-ecc,+trap-handler,+unaligned-access-mode,+unaligned-buffer-access,+unaligned-ds-access,+vgpr-index-mode,+vop3p,-wavefrontsize16,-wavefrontsize32,+wavefrontsize64,+xnack" } +attributes #1 = { convergent norecurse nounwind mustprogress + "amdgpu-flat-work-group-size"="1,256"} diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll @@ -0,0 +1,69 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-early %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" + +; Test to check if we skip propagating attributes even if +; a function is called directly as well as +; indirectly. "baz" is called directly as well as indirectly. 
+ +; CHECK-LABEL: define float @foo(i32 %a) { +define float @foo(i32 %a) { +entry: + %mul = mul nsw i32 %a, 5 + %cast = sitofp i32 %mul to float + + ret float %cast +} + +; CHECK-LABEL: define float @bar(i32 %a) { +define float @bar(i32 %a) { +entry: + %div = sdiv i32 %a, 7 + %conv = sitofp i32 %div to float + ret float %conv +} + +; CHECK-LABEL: define float @baz(i32 %a) { +define float @baz(i32 %a) { +entry: + %mul = mul nsw i32 %a, 6 + %div = sdiv i32 %mul, 7 + %conv = sitofp i32 %div to float + ret float %conv +} + +define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 { +entry: + %fn = alloca float (i32)*, align 8, addrspace(5) + switch i32 %type, label %sw.default [ + i32 1, label %sw.bb + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb: + store float (i32)* @foo, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb2: + store float (i32)* @bar, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb3: + store float (i32)* @baz, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + %fp = load float (i32)*, float (i32)* addrspace(5)* %fn, align 8 + %direct_call = call contract float @baz(i32 4) + %conv = fptosi float %direct_call to i32 + %call4 = call contract float %fp(i32 %conv) + store float %call4, float* %result, align 4 + ret void +} + +attributes #1 = { convergent norecurse nounwind mustprogress + "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect.ll @@ -0,0 +1,79 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-early %s | FileCheck %s + +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" + +; Test to check if we skip attributes on address +; taken functions but pass to direct callees. + +; CHECK-LABEL: define float @foo(i32 %a) { +define float @foo(i32 %a) { +entry: + %mul = mul nsw i32 %a, 5 + %cast = sitofp i32 %mul to float + + ret float %cast +} + +; CHECK-LABEL: define float @bar(i32 %a) { +define float @bar(i32 %a) { +entry: + %div = sdiv i32 %a, 7 + %conv = sitofp i32 %div to float + ret float %conv +} + +; CHECK-LABEL: define float @baz(i32 %a) { +define float @baz(i32 %a) { +entry: + %mul = mul nsw i32 %a, 6 + %div = sdiv i32 %mul, 7 + %conv = sitofp i32 %div to float + ret float %conv +} + +; CHECK-LABEL: define float @baz2(i32 %a) #0 { +define float @baz2(i32 %a) { + %mul = mul nsw i32 %a, 6 + %div = sdiv i32 %mul, 8 + %add = add i32 %div , 12 + %conv = sitofp i32 %add to float + ret float %conv +} + +define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 { +entry: + %fn = alloca float (i32)*, align 8, addrspace(5) + switch i32 %type, label %sw.default [ + i32 1, label %sw.bb + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb: + store float (i32)* @foo, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb2: + store float (i32)* @bar, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb3: + store float (i32)* @baz, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + %fp = load float (i32)*, float (i32)* addrspace(5)* %fn, align 8 + %direct_call = call contract float @baz2(i32 5) + %conv = fptosi float %direct_call to i32 + %call4 = call contract float %fp(i32 %conv) + store float %call4, float* %result, align 4 + ret void +} + + +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" 
"target-features"="+16-bit-insts,+add-no-carry-insts,+aperture-regs,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+ds-src2-insts,+enable-ds128,+enable-prt-strict-null,+fast-denormal-f32,+fast-fmaf,+flat-address-space,+flat-for-global,+flat-global-insts,+flat-inst-offsets,+flat-scratch-insts,+fma-mix-insts,+fp64,+gcn3-encoding,+gfx7-gfx8-gfx9-insts,+gfx8-insts,+gfx9,+gfx9-insts,+half-rate-64-ops,+image-gather4-d16-bug,+int-clamp-insts,+inv-2pi-inline-imm,+ldsbankcount32,+load-store-opt,+localmemorysize65536,+mad-mac-f32-insts,+no-xnack-support,+promote-alloca,+r128-a16,+s-memrealtime,+s-memtime-inst,+scalar-atomics,+scalar-flat-scratch-insts,+scalar-stores,+sdwa,+sdwa-omod,+sdwa-scalar,+sdwa-sdst,+sram-ecc,+trap-handler,+unaligned-access-mode,+unaligned-buffer-access,+unaligned-ds-access,+vgpr-index-mode,+vop3p,-wavefrontsize16,-wavefrontsize32,+wavefrontsize64,+xnack" } +attributes #1 = { convergent norecurse nounwind mustprogress + "amdgpu-flat-work-group-size"="1,256"} diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll @@ -0,0 +1,67 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-early %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" + +; Test to check if we skip attributes on address +; taken functions in a simple call graph. 
+ +; CHECK-LABEL: define float @foo(i32 %a) { +define float @foo(i32 %a) { +entry: + %mul = mul nsw i32 %a, 5 + %cast = sitofp i32 %mul to float + + ret float %cast +} + +; CHECK-LABEL: define float @bar(i32 %a) { +define float @bar(i32 %a) { +entry: + %div = sdiv i32 %a, 7 + %conv = sitofp i32 %div to float + ret float %conv +} + +; CHECK-LABEL: define float @baz(i32 %a) { +define float @baz(i32 %a) { +entry: + %mul = mul nsw i32 %a, 6 + %div = sdiv i32 %mul, 7 + %conv = sitofp i32 %div to float + ret float %conv +} + +define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 { +entry: + %fn = alloca float (i32)*, align 8, addrspace(5) + switch i32 %type, label %sw.default [ + i32 1, label %sw.bb + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb: + store float (i32)* @foo, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb2: + store float (i32)* @bar, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb3: + store float (i32)* @baz, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + %fp = load float (i32)*, float (i32)* addrspace(5)* %fn, align 8 + %call4 = call contract float %fp(i32 7) + store float %call4, float* %result, align 4 + ret void +} + +attributes #1 = { convergent norecurse nounwind mustprogress + "amdgpu-flat-work-group-size"="1,256"} +