diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp @@ -236,6 +236,11 @@ if (F.isDeclaration()) continue; + // Skip propagating attributes and features to + // address taken functions. + if (F.hasAddressTaken()) + continue; + const FnProperties CalleeProps(*TM, F); SmallVector, 32> ToReplace; SmallSet Visited; diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll @@ -0,0 +1,78 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-early %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" + +; Complicated call graph where a function is called +; directly from a kernel and also from a function +; whose address is taken. 
+ +; CHECK-LABEL: define float @common_callee.gc(i32 %a) #0 { +define float @common_callee.gc(i32 %a) { + %add = add i32 %a, 6 + %mul = mul nsw i32 %add, 9 + %div = sdiv i32 %mul, 8 + %f = sitofp i32 %div to float + ret float %f +} + +; CHECK-LABEL: define float @foo(i32 %a) { +define float @foo(i32 %a) { +entry: + %mul = mul nsw i32 %a, 5 + %cast = sitofp i32 %mul to float + + ret float %cast +} + +; CHECK-LABEL: define float @bar(i32 %a) { +define float @bar(i32 %a) { +entry: + %div = sdiv i32 %a, 7 + %direct_call = call contract float @common_callee.gc(i32 5) + ret float %direct_call +} + +; CHECK-LABEL: define float @baz(i32 %a) { +define float @baz(i32 %a) { +entry: + %mul = mul nsw i32 %a, 6 + %div = sdiv i32 %mul, 7 + %conv = sitofp i32 %div to float + ret float %conv +} + +define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 { +entry: + %fn = alloca float (i32)*, align 8, addrspace(5) + switch i32 %type, label %sw.default [ + i32 1, label %sw.bb + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb: + store float (i32)* @foo, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb2: + store float (i32)* @bar, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb3: + store float (i32)* @baz, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + %fp = load float (i32)*, float (i32)* addrspace(5)* %fn, align 8 + %direct_call = call contract float @common_callee.gc(i32 4) + %conv = fptosi float %direct_call to i32 + %call4 = call contract float %fp(i32 %conv) + store float %call4, float* %result, align 4 + ret void +} + +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" 
"target-features"="+16-bit-insts,+add-no-carry-insts,+aperture-regs,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+ds-src2-insts,+enable-ds128,+enable-prt-strict-null,+fast-denormal-f32,+fast-fmaf,+flat-address-space,+flat-for-global,+flat-global-insts,+flat-inst-offsets,+flat-scratch-insts,+fma-mix-insts,+fp64,+gcn3-encoding,+gfx7-gfx8-gfx9-insts,+gfx8-insts,+gfx9,+gfx9-insts,+half-rate-64-ops,+image-gather4-d16-bug,+int-clamp-insts,+inv-2pi-inline-imm,+ldsbankcount32,+load-store-opt,+localmemorysize65536,+mad-mac-f32-insts,+no-xnack-support,+promote-alloca,+r128-a16,+s-memrealtime,+s-memtime-inst,+scalar-atomics,+scalar-flat-scratch-insts,+scalar-stores,+sdwa,+sdwa-omod,+sdwa-scalar,+sdwa-sdst,+sram-ecc,+trap-handler,+unaligned-access-mode,+unaligned-buffer-access,+unaligned-ds-access,+vgpr-index-mode,+vop3p,-wavefrontsize16,-wavefrontsize32,+wavefrontsize64,+xnack" } +attributes #1 = { convergent norecurse nounwind mustprogress "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="56" "amdgpu-unsafe-fp-atomics"="true" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll @@ -0,0 +1,68 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-early %s | FileCheck %s + 
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" + +; Test to check if we skip propagating attributes even if +; a function is called directly as well as +; indirectly. "baz" is called directly as well as indirectly. + +; CHECK-LABEL: define float @foo(i32 %a) { +define float @foo(i32 %a) { +entry: + %mul = mul nsw i32 %a, 5 + %cast = sitofp i32 %mul to float + + ret float %cast +} + +; CHECK-LABEL: define float @bar(i32 %a) { +define float @bar(i32 %a) { +entry: + %div = sdiv i32 %a, 7 + %conv = sitofp i32 %div to float + ret float %conv +} + +; CHECK-LABEL: define float @baz(i32 %a) { +define float @baz(i32 %a) { +entry: + %mul = mul nsw i32 %a, 6 + %div = sdiv i32 %mul, 7 + %conv = sitofp i32 %div to float + ret float %conv +} + +define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 { +entry: + %fn = alloca float (i32)*, align 8, addrspace(5) + switch i32 %type, label %sw.default [ + i32 1, label %sw.bb + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb: + store float (i32)* @foo, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb2: + store float (i32)* @bar, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb3: + store float (i32)* @baz, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + %fp = load float (i32)*, float (i32)* addrspace(5)* %fn, align 8 + %direct_call = call contract float @baz(i32 4) + %conv = fptosi float %direct_call to i32 + %call4 = call contract float %fp(i32 %conv) + store float %call4, float* %result, align 4 + ret void +} + +attributes #1 = { convergent norecurse nounwind mustprogress "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="56" "amdgpu-unsafe-fp-atomics"="true" "disable-tail-calls"="false" "frame-pointer"="none" 
"less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect.ll @@ -0,0 +1,78 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-early %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" + +; Test to check if we skip attributes on address +; taken functions but pass to direct callees. 
+ +; CHECK-LABEL: define float @foo(i32 %a) { +define float @foo(i32 %a) { +entry: + %mul = mul nsw i32 %a, 5 + %cast = sitofp i32 %mul to float + + ret float %cast +} + +; CHECK-LABEL: define float @bar(i32 %a) { +define float @bar(i32 %a) { +entry: + %div = sdiv i32 %a, 7 + %conv = sitofp i32 %div to float + ret float %conv +} + +; CHECK-LABEL: define float @baz(i32 %a) { +define float @baz(i32 %a) { +entry: + %mul = mul nsw i32 %a, 6 + %div = sdiv i32 %mul, 7 + %conv = sitofp i32 %div to float + ret float %conv +} + +; CHECK-LABEL: define float @baz2(i32 %a) #0 { +define float @baz2(i32 %a) { + %mul = mul nsw i32 %a, 6 + %div = sdiv i32 %mul, 8 + %add = add i32 %div , 12 + %conv = sitofp i32 %add to float + ret float %conv +} + +define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 { +entry: + %fn = alloca float (i32)*, align 8, addrspace(5) + switch i32 %type, label %sw.default [ + i32 1, label %sw.bb + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb: + store float (i32)* @foo, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb2: + store float (i32)* @bar, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb3: + store float (i32)* @baz, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + %fp = load float (i32)*, float (i32)* addrspace(5)* %fn, align 8 + %direct_call = call contract float @baz2(i32 5) + %conv = fptosi float %direct_call to i32 + %call4 = call contract float %fp(i32 %conv) + store float %call4, float* %result, align 4 + ret void +} + + +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" 
"target-features"="+16-bit-insts,+add-no-carry-insts,+aperture-regs,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+ds-src2-insts,+enable-ds128,+enable-prt-strict-null,+fast-denormal-f32,+fast-fmaf,+flat-address-space,+flat-for-global,+flat-global-insts,+flat-inst-offsets,+flat-scratch-insts,+fma-mix-insts,+fp64,+gcn3-encoding,+gfx7-gfx8-gfx9-insts,+gfx8-insts,+gfx9,+gfx9-insts,+half-rate-64-ops,+image-gather4-d16-bug,+int-clamp-insts,+inv-2pi-inline-imm,+ldsbankcount32,+load-store-opt,+localmemorysize65536,+mad-mac-f32-insts,+no-xnack-support,+promote-alloca,+r128-a16,+s-memrealtime,+s-memtime-inst,+scalar-atomics,+scalar-flat-scratch-insts,+scalar-stores,+sdwa,+sdwa-omod,+sdwa-scalar,+sdwa-sdst,+sram-ecc,+trap-handler,+unaligned-access-mode,+unaligned-buffer-access,+unaligned-ds-access,+vgpr-index-mode,+vop3p,-wavefrontsize16,-wavefrontsize32,+wavefrontsize64,+xnack" } +attributes #1 = { convergent norecurse nounwind mustprogress "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="56" "amdgpu-unsafe-fp-atomics"="true" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll @@ -0,0 +1,66 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-early %s | FileCheck %s + +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" + +; Test to check if we skip attributes on address +; taken functions in a simple call graph. + +; CHECK-LABEL: define float @foo(i32 %a) { +define float @foo(i32 %a) { +entry: + %mul = mul nsw i32 %a, 5 + %cast = sitofp i32 %mul to float + + ret float %cast +} + +; CHECK-LABEL: define float @bar(i32 %a) { +define float @bar(i32 %a) { +entry: + %div = sdiv i32 %a, 7 + %conv = sitofp i32 %div to float + ret float %conv +} + +; CHECK-LABEL: define float @baz(i32 %a) { +define float @baz(i32 %a) { +entry: + %mul = mul nsw i32 %a, 6 + %div = sdiv i32 %mul, 7 + %conv = sitofp i32 %div to float + ret float %conv +} + +define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 { +entry: + %fn = alloca float (i32)*, align 8, addrspace(5) + switch i32 %type, label %sw.default [ + i32 1, label %sw.bb + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb: + store float (i32)* @foo, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb2: + store float (i32)* @bar, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.bb3: + store float (i32)* @baz, float (i32)* addrspace(5)* %fn, align 8 + br label %sw.epilog + +sw.default: + br label %sw.epilog + +sw.epilog: + %fp = load float (i32)*, float (i32)* addrspace(5)* %fn, align 8 + %call4 = call contract float %fp(i32 7) + store float %call4, float* %result, align 4 + ret void +} + +attributes #1 = { convergent norecurse nounwind mustprogress "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="56" "amdgpu-unsafe-fp-atomics"="true" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +