diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@@ -236,6 +236,15 @@
       if (F.isDeclaration())
         continue;
 
+      if (F.hasAddressTaken()) {
+        // Skip propagating attributes to address-taken functions; just set
+        // their features to the subtarget's with TargetFeatures masked out.
+        auto Features = TM->getSubtargetImpl(F)->getFeatureBits();
+        FeatureBitset FB(Features & ~TargetFeatures);
+        setFeatures(F, FB);
+        continue;
+      }
+
       const FnProperties CalleeProps(*TM, F);
       SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace;
       SmallSet<CallBase *, 32> Visited;
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect.ll
@@ -0,0 +1,91 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-early %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+; CHECK: define hidden float @_Z3fooi(i32 %a) #0 {
+define hidden float @_Z3fooi(i32 %a) {
+entry:
+  %mul = mul nsw i32 %a, 5
+  %cast = sitofp i32 %mul to float
+
+  ret float %cast
+}
+
+; CHECK: define hidden float @_Z3bari(i32 %a) #0 {
+define hidden float @_Z3bari(i32 %a) {
+entry:
+  %div = sdiv i32 %a, 7
+  %conv = sitofp i32 %div to float
+  ret float %conv
+}
+
+; CHECK: define hidden float @_Z3bazi(i32 %a) #0 {
+define hidden float @_Z3bazi(i32 %a) {
+entry:
+  %mul = mul nsw i32 %a, 6
+  %div = sdiv i32 %mul, 7
+  %conv = sitofp i32 %div to float
+  ret float %conv
+}
+
+; CHECK: define hidden float @_Z4baz2i(i32 %a) #1 {
+define hidden float @_Z4baz2i(i32 %a) #0 {
+  %mul = mul nsw i32 %a, 6
+  %div = sdiv i32 %mul, 8
+  %add = add i32 %div, 12
+  %conv = sitofp i32 %add to float
+  ret float %conv
+}
+
+; CHECK: define protected amdgpu_kernel void @_Z22switch_indirect_kernelPfi(float addrspace(1)* %result.coerce, i32 %type) #2 {
+define protected amdgpu_kernel void @_Z22switch_indirect_kernelPfi(float addrspace(1)* %result.coerce, i32 %type) #1 {
+entry:
+  %result = alloca float*, align 8, addrspace(5)
+  %result.ascast = addrspacecast float* addrspace(5)* %result to float**
+  %result.addr = alloca float*, align 8, addrspace(5)
+  %result.addr.ascast = addrspacecast float* addrspace(5)* %result.addr to float**
+  %type.addr = alloca i32, align 4, addrspace(5)
+  %type.addr.ascast = addrspacecast i32 addrspace(5)* %type.addr to i32*
+  %fn = alloca float (i32)*, align 8, addrspace(5)
+  %fn.ascast = addrspacecast float (i32)* addrspace(5)* %fn to float (i32)**
+  %P = alloca float, align 4, addrspace(5)
+  %P.ascast = addrspacecast float addrspace(5)* %P to float*
+  %0 = addrspacecast float addrspace(1)* %result.coerce to float*
+  store float* %0, float** %result.ascast, align 8
+  %result1 = load float*, float** %result.ascast, align 8
+  store float* %result1, float** %result.addr.ascast, align 8
+  %1 = bitcast float (i32)* addrspace(5)* %fn to i8 addrspace(5)*
+  %2 = load i32, i32* %type.addr.ascast, align 4
+  switch i32 %2, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb:                                            ; preds = %entry
+  store float (i32)* @_Z3fooi, float (i32)** %fn.ascast, align 8
+  br label %sw.epilog
+
+sw.bb2:                                           ; preds = %entry
+  store float (i32)* @_Z3bari, float (i32)** %fn.ascast, align 8
+  br label %sw.epilog
+
+sw.bb3:                                           ; preds = %entry
+  store float (i32)* @_Z3bazi, float (i32)** %fn.ascast, align 8
+  br label %sw.epilog
+
+sw.default:                                       ; preds = %entry
+  br label %sw.epilog
+
+sw.epilog:                                        ; preds = %sw.default, %sw.bb3, %sw.bb2, %sw.bb
+  %fp = load float (i32)*, float (i32)** %fn.ascast, align 8
+  %direct_call = call contract float @_Z4baz2i(i32 5)
+  %conv = fptosi float %direct_call to i32
+  %call4 = call contract float %fp(i32 %conv)
+  %res = load float*, float** %result.addr.ascast, align 8
+  store float %call4, float* %res, align 4
+  ret void
+}
+
+attributes #1 = { convergent norecurse nounwind mustprogress "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="56" "amdgpu-unsafe-fp-atomics"="true" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll
@@ -0,0 +1,80 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-early %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+; CHECK: define hidden float @_Z3fooi(i32 %a) #0 {
+define hidden float @_Z3fooi(i32 %a) {
+entry:
+  %mul = mul nsw i32 %a, 5
+  %cast = sitofp i32 %mul to float
+
+  ret float %cast
+}
+
+; CHECK: define hidden float @_Z3bari(i32 %a) #0 {
+define hidden float @_Z3bari(i32 %a) {
+entry:
+  %div = sdiv i32 %a, 7
+  %conv = sitofp i32 %div to float
+  ret float %conv
+}
+
+; CHECK: define hidden float @_Z3bazi(i32 %a) #0 {
+define hidden float @_Z3bazi(i32 %a) {
+entry:
+  %mul = mul nsw i32 %a, 6
+  %div = sdiv i32 %mul, 7
+  %conv = sitofp i32 %div to float
+  ret float %conv
+}
+
+; CHECK: define protected amdgpu_kernel void @_Z22switch_indirect_kernelPfi(float addrspace(1)* %result.coerce, i32 %type) #1 {
+define protected amdgpu_kernel void @_Z22switch_indirect_kernelPfi(float addrspace(1)* %result.coerce, i32 %type) #1 {
+entry:
+  %result = alloca float*, align 8, addrspace(5)
+  %result.ascast = addrspacecast float* addrspace(5)* %result to float**
+  %result.addr = alloca float*, align 8, addrspace(5)
+  %result.addr.ascast = addrspacecast float* addrspace(5)* %result.addr to float**
+  %type.addr = alloca i32, align 4, addrspace(5)
+  %type.addr.ascast = addrspacecast i32 addrspace(5)* %type.addr to i32*
+  %fn = alloca float (i32)*, align 8, addrspace(5)
+  %fn.ascast = addrspacecast float (i32)* addrspace(5)* %fn to float (i32)**
+  %P = alloca float, align 4, addrspace(5)
+  %P.ascast = addrspacecast float addrspace(5)* %P to float*
+  %0 = addrspacecast float addrspace(1)* %result.coerce to float*
+  store float* %0, float** %result.ascast, align 8
+  %result1 = load float*, float** %result.ascast, align 8
+  store float* %result1, float** %result.addr.ascast, align 8
+  %1 = bitcast float (i32)* addrspace(5)* %fn to i8 addrspace(5)*
+  %2 = load i32, i32* %type.addr.ascast, align 4
+  switch i32 %2, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb:                                            ; preds = %entry
+  store float (i32)* @_Z3fooi, float (i32)** %fn.ascast, align 8
+  br label %sw.epilog
+
+sw.bb2:                                           ; preds = %entry
+  store float (i32)* @_Z3bari, float (i32)** %fn.ascast, align 8
+  br label %sw.epilog
+
+sw.bb3:                                           ; preds = %entry
+  store float (i32)* @_Z3bazi, float (i32)** %fn.ascast, align 8
+  br label %sw.epilog
+
+sw.default:                                       ; preds = %entry
+  br label %sw.epilog
+
+sw.epilog:                                        ; preds = %sw.default, %sw.bb3, %sw.bb2, %sw.bb
+  %fp = load float (i32)*, float (i32)** %fn.ascast, align 8
+  %call4 = call contract float %fp(i32 6)
+  %res = load float*, float** %result.addr.ascast, align 8
+  store float %call4, float* %res, align 4
+  ret void
+}
+
+attributes #1 = { convergent norecurse nounwind mustprogress "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="56" "amdgpu-unsafe-fp-atomics"="true" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }