diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1903,6 +1903,25 @@ }] >; +// Create two BFI instructions at once, if possible. +// This tries to handle one-level deep nested bitfieldInserts: +// +// (((src << numBits) ^ y) & imm0) ^ bfi(x, y, z) => +// v_bfi (imm0, lshlrev(numBits, src), bfi(x, y, z)) +// +// Such sequences can occur after InstCombine: +// A (xor (and imm0, (xor (shl), (xor (and (xor (shl)), imm1))))) has two +// BFI parts. The outer BFI part relies on the inner BFI part. +// During InstCombine, the inner xor sequence gets turned into +// bfi_0 = (y & x) | (z & ~x) and later to a BFI, while the outer BFI part +// stays untouched and will not be converted into a BFI instruction. +def : AMDGPUPat < + (DivergentBinFrag<xor> (and (xor (shl i32:$src, (i32 imm:$numBits)), i32:$y), (i32 imm:$imm0)), + (BFIImm32 i32:$x, i32:$y, i32:$z)), + (V_BFI_B32_e64 VSrc_b32:$imm0, (V_LSHLREV_B32_e64 i32:$numBits, i32:$src), + (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)) +>; + // Definition from ISA doc: // (y & x) | (z & ~x) def : AMDGPUPat < diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll --- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -1912,34 +1912,30 @@ ; GFX7-LABEL: v_bfi_seq_i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 20, v0 ; GFX7-NEXT: s_mov_b32 s4, 0xffc00 -; GFX7-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_bfi_b32 v2, s4, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0x3ff00000, v0 -; GFX7-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 20, v0 +; GFX7-NEXT: s_mov_b32 s4, 0x3ff00000 +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_bfi_seq_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 20, v0 ; GFX8-NEXT: s_mov_b32 s4, 0xffc00 -; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_bfi_b32 v2, s4, v1, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0x3ff00000, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 20, v0 +; GFX8-NEXT: s_mov_b32 s4, 0x3ff00000 +; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_bfi_seq_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 20, v0 -; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfi_b32 v1, 0xffc00, v1, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0x3ff00000, v0 -; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 20, v0 +; GFX10-NEXT: v_bfi_b32 v0, 0x3ff00000, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: v_bfi_seq_i32: