[AMDGPU] Divergence-driven instruction selection for i32 abs

Summary of what this patch contains:
  * SOPInstructions.td: S_ABS_I32 and the smax(x, -x) pattern are restricted
    to the Uniform*Frag forms of abs / smax.
  * SIInstructions.td: the DivergentUnaryFrag form of i32 abs is selected as
    V_MAX_I32_e64 (0 - x), x. The subtraction is V_SUB_CO_U32_e32 by default
    and V_SUB_U32_e32 (AddedComplexity = 1) under HasAddNoCarryInsts.
  * divergence-driven-abs.ll: new test covering scalar and <2 x i32> inputs,
    from kernel arguments and from loads indexed by workitem.id.x.

NOTE(review): the "FUNC-LABEL" lines in the test are not matched by any
-check-prefixes value in the RUN lines (GCN,SI / GCN,GFX900), so they act as
comments only.
NOTE(review): the <zext> operator on the i16 context line in SIInstructions.td
is inferred from the 0xffff mask - confirm against the tree before applying.

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2263,6 +2263,18 @@
 // certainty what the source behavior is without more context on how
 // the src is lowered. e.g. fptrunc + fma may be lowered to a
 // v_fma_mix* instruction which does not zero, or may not.
+def : GCNPat<
+  (i32 (DivergentUnaryFrag<abs> i32:$src)),
+  (V_MAX_I32_e64 (V_SUB_CO_U32_e32 (i32 0), $src), $src)>;
+
+let AddedComplexity = 1 in {
+def : GCNPat<
+  (i32 (DivergentUnaryFrag<abs> i32:$src)),
+  (V_MAX_I32_e64 (V_SUB_U32_e32 (i32 0), $src), $src)>{
+  let SubtargetPredicate = HasAddNoCarryInsts;
+}
+} // AddedComplexity = 1
+
 def : GCNPat<
   (i32 (DivergentUnaryFrag<zext> i16:$src)),
   (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -341,7 +341,7 @@
 
 let Defs = [SCC] in {
   def S_ABS_I32 : SOP1_32 <"s_abs_i32",
-    [(set i32:$sdst, (abs i32:$src0))]
+    [(set i32:$sdst, (UniformUnaryFrag<abs> i32:$src0))]
   >;
 } // End Defs = [SCC]
 
@@ -1377,7 +1377,7 @@
 >;
 
 def : GCNPat <
-  (i32 (smax i32:$x, (i32 (ineg i32:$x)))),
+  (i32 (UniformBinFrag<smax> i32:$x, (i32 (ineg i32:$x)))),
   (S_ABS_I32 SReg_32:$x)
 >;
 
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll
@@ -0,0 +1,71 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
+
+; FUNC-LABEL: {{^}}s_abs_i32:
+; GCN: S_ABS_I32
+define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
+  %neg = sub i32 0, %val
+  %cond = icmp sgt i32 %val, %neg
+  %res = select i1 %cond, i32 %val, i32 %neg
+  %res2 = add i32 %res, 2
+  store i32 %res2, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_abs_i32:
+; SI: V_SUB_CO_U32_e64
+; GFX900: V_SUB_U32_e64
+; GCN: V_MAX_I32_e64
+define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %src, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in, align 4
+  %neg = sub i32 0, %val
+  %cond = icmp sgt i32 %val, %neg
+  %res = select i1 %cond, i32 %val, i32 %neg
+  %res2 = add i32 %res, 2
+  store i32 %res2, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_abs_v2i32:
+; GCN: S_ABS_I32
+; GCN: S_ABS_I32
+define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind {
+  %z0 = insertelement <2 x i32> undef, i32 0, i32 0
+  %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
+  %t0 = insertelement <2 x i32> undef, i32 2, i32 0
+  %t1 = insertelement <2 x i32> %t0, i32 2, i32 1
+  %neg = sub <2 x i32> %z1, %val
+  %cond = icmp sgt <2 x i32> %val, %neg
+  %res = select <2 x i1> %cond, <2 x i32> %val, <2 x i32> %neg
+  %res2 = add <2 x i32> %res, %t1
+  store <2 x i32> %res2, <2 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_abs_v2i32:
+; SI: V_SUB_CO_U32_e64
+; GFX900: V_SUB_U32_e64
+; GCN: V_MAX_I32_e64
+; GCN: V_MAX_I32_e64
+define amdgpu_kernel void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind {
+  %z0 = insertelement <2 x i32> undef, i32 0, i32 0
+  %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
+  %t0 = insertelement <2 x i32> undef, i32 2, i32 0
+  %t1 = insertelement <2 x i32> %t0, i32 2, i32 1
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %src, i32 %tid
+  %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in, align 4
+  %neg = sub <2 x i32> %z1, %val
+  %cond = icmp sgt <2 x i32> %val, %neg
+  %res = select <2 x i1> %cond, <2 x i32> %val, <2 x i32> %neg
+  %res2 = add <2 x i32> %res, %t1
+  store <2 x i32> %res2, <2 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }