diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2269,6 +2269,40 @@
   (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
 >;
 
+// Restrict the range to prevent using an additional VGPR
+// for the shifted value.
+def IMMBitSelRange : ImmLeaf<i32, [{
+  return Imm > 0 && Imm < 16;
+}]>;
+
+def IMMBitSelConst : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((1 << N->getZExtValue()), SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+// Matching separate SRL and TRUNC instructions
+// with dependent operands (SRL dest is source of TRUNC)
+// generates three instructions. However, by shifting the mask
+// constant instead of the value, the V_LSHRREV_B32_e64 is folded
+// away and $a feeds the V_AND_B32_e64 instruction directly:
+// (trunc i32 (srl i32 $a, i32 $b)) ->
+//   v_and_b32_e64 $a, (1 << $b), $a
+//   v_cmp_eq_u32_e64 $a, (1 << $b), $a
+
+// Handle the VALU case.
+def : GCNPat <
+  (i1 (DivergentUnaryFrag<trunc> (i32 (srl i32:$a, IMMBitSelRange:$b)))),
+  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 (IMMBitSelConst $b)), $a),
+    (i32 (IMMBitSelConst $b)))
+>;
+
+// Handle the scalar case.
+def : GCNPat <
+  (i1 (UniformUnaryFrag<trunc> (i32 (srl i32:$a, IMMBitSelRange:$b)))),
+  (S_CMP_EQ_U32 (S_AND_B32 (i32 (IMMBitSelConst $b)), $a),
+    (i32 (IMMBitSelConst $b)))
+>;
+
 def : GCNPat <
   (i1 (DivergentUnaryFrag<trunc> i64:$a)),
   (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1),
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -stop-after=amdgpu-isel -verify-machineinstrs -O0 < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: bb.0.entry:
+; GCN-NOT: V_LSHRREV_B32_e64
+; GCN: V_AND_B32_e64 2
+; GCN: V_CMP_EQ_U32_e64 killed %{{.*}}, 2
+define i32 @divergent_lshr_and_cmp(i32 %x) {
+entry:
+  %0 = and i32 %x, 2
+  %1 = icmp ne i32 %0, 0
+  ; The divergent branch is lowered to llvm.amdgcn.if, which keeps the truncate in the SDag.
+  br i1 %1, label %out.true, label %out.else
+
+out.true:
+  %2 = shl i32 %x, 2
+  ret i32 %2
+
+out.else:
+  ret i32 %x
+}
+
+; GCN-LABEL: bb.0.entry:
+; GCN: S_AND_B32 2
+; GCN: S_CMP_EQ_U32 killed %{{.*}}, 2
+define amdgpu_kernel void @uniform_opt_lshr_and_cmp(i1 addrspace(1)* %out, i32 %x) {
+entry:
+  %0 = and i32 %x, 2
+  %1 = icmp ne i32 %0, 0
+  ; Keep the truncate from being optimized away in the SDag.
+  br i1 %1, label %out.true, label %out.else
+
+out.true:
+  %2 = xor i1 %1, true
+  store i1 %2, i1 addrspace(1)* %out
+  ret void
+
+out.else:
+  store i1 %1, i1 addrspace(1)* %out
+  ret void
+}
\ No newline at end of file
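
For illustration, here is a minimal standalone IR sketch (hypothetical, not part of the patch; the function name and bit index are made up) that exercises the new VALU pattern directly, without going through the and+icmp DAGCombine used in the test. Since 3 falls inside IMMBitSelRange, isel should select V_AND_B32_e64 and V_CMP_EQ_U32_e64 with the inline constant 8 (1 << 3) and no intermediate V_LSHRREV_B32_e64:

; Hypothetical example: test bit 3 of a divergent value (function arguments
; in a non-kernel function are divergent). Expected selection under the new
; GCNPat: V_AND_B32_e64 8, %x followed by V_CMP_EQ_U32_e64 ..., 8.
define i1 @divergent_bit3(i32 %x) {
entry:
  %shifted = lshr i32 %x, 3          ; srl i32 $a, 3; 3 is in IMMBitSelRange
  %bit = trunc i32 %shifted to i1    ; matched by DivergentUnaryFrag<trunc>
  ret i1 %bit
}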
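
A hedged sketch of the scalar path as well (again hypothetical, not from the patch): in an amdgpu_kernel the argument %x is a uniform kernel argument, so the UniformUnaryFrag<trunc> pattern should apply and the same bit test should come out as S_AND_B32 / S_CMP_EQ_U32 with the constant 8:

; Hypothetical example: the uniform variant of the same bit test.
; Expected selection: S_AND_B32 8, %x then S_CMP_EQ_U32 ..., 8.
define amdgpu_kernel void @uniform_bit3(i1 addrspace(1)* %out, i32 %x) {
entry:
  %shifted = lshr i32 %x, 3
  %bit = trunc i32 %shifted to i1
  store i1 %bit, i1 addrspace(1)* %out
  ret void
}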