diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1552,6 +1552,21 @@
 
 // BFI patterns
 
+// Matches (y & C) | (z & ~C) wrapped in DivergentBinFrag<or>, where the mask
+// of each AND is an immediate. The predicate reads operand 1 of both AND
+// nodes and accepts the match only when both are constants and, after
+// casting to unsigned, one is the bitwise complement of the other.
+def BFIImm32 : PatFrag<
+  (ops node:$x, node:$y, node:$z),
+  (i32 (DivergentBinFrag<or> (and node:$y, node:$x), (and node:$z, imm))),
+  [{
+    auto *X = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1));
+    auto *NotX = dyn_cast<ConstantSDNode>(N->getOperand(1)->getOperand(1));
+    return X && NotX &&
+      ~(unsigned)X->getZExtValue() == (unsigned)NotX->getZExtValue();
+  }]
+>;
+
 // Definition from ISA doc:
 // (y & x) | (z & ~x)
 def : AMDGPUPat <
@@ -1559,6 +1574,12 @@
   (V_BFI_B32 $x, $y, $z)
 >;
 
+// (y & C) | (z & ~C)
+def : AMDGPUPat <
+  (BFIImm32 i32:$x, i32:$y, i32:$z),
+  (V_BFI_B32 $x, $y, $z)
+>;
+
 // 64-bit version
 def : AMDGPUPat <
   (DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1285,11 +1285,11 @@
 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_and_b32 s0, s4, 0xffff
+; VI-NEXT: s_mov_b32 s0, 0xffff
+; VI-NEXT: v_mov_b32_e32 v4, s4
 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; VI-NEXT: v_or_b32_e32 v0, s0, v0
+; VI-NEXT: v_bfi_b32 v0, s0, v4, v0
 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT: s_endpgm
 ;
@@ -1305,11 +1305,11 @@
 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
 ; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: s_and_b32 s0, s4, 0xffff
+; CI-NEXT: s_mov_b32 s0, 0xffff
+; CI-NEXT: v_mov_b32_e32 v4, s4
 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-;
CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_or_b32_e32 v0, s0, v0 +; CI-NEXT: v_bfi_b32 v0, s0, v4, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1415,11 +1415,11 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_and_b32 s0, s4, 0xffff +; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v1, s0, v1 +; VI-NEXT: v_bfi_b32 v1, s0, v4, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1435,11 +1435,11 @@ ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: s_and_b32 s0, s4, 0xffff +; CI-NEXT: s_mov_b32 s0, 0xffff +; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: v_or_b32_e32 v1, s0, v1 +; CI-NEXT: v_bfi_b32 v1, s0, v4, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1545,11 +1545,11 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_and_b32 s0, s4, 0xffff +; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v1, s0, v1 +; VI-NEXT: v_bfi_b32 v1, s0, v4, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1565,11 +1565,11 @@ ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: s_and_b32 s0, s4, 0xffff 
+; CI-NEXT: s_mov_b32 s0, 0xffff +; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: v_or_b32_e32 v1, s0, v1 +; CI-NEXT: v_bfi_b32 v1, s0, v4, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -905,10 +905,9 @@ ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v2 +; SI-NEXT: s_mov_b32 s4, 0xffff0000 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -978,10 +977,9 @@ ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, -7, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, -7, v2 +; SI-NEXT: s_mov_b32 s4, 0xffff0000 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -1052,10 +1050,9 @@ ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 -; SI-NEXT: v_and_b32_e32 v2, 
0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v2 +; SI-NEXT: s_mov_b32 s4, 0xffff0000 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xff850000, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -1127,10 +1124,9 @@ ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, -7, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, -7, v2 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_bfi_b32 v2, s4, v3, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; @@ -1406,10 +1402,9 @@ ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_subrev_i32_e32 v3, vcc, 32, v2 +; SI-NEXT: s_mov_b32 s4, 0xffff0000 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -1547,10 +1542,9 @@ ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_subrev_i32_e32 v3, vcc, 32, v2 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_bfi_b32 v2, s4, v3, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; @@ -1619,10 +1613,9 @@ ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: 
s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, -16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, -16, v2 +; SI-NEXT: s_mov_b32 s4, 0xffff0000 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -1760,10 +1753,9 @@ ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, -16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, -16, v2 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_bfi_b32 v2, s4, v3, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; @@ -1831,10 +1823,9 @@ ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffffc400, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0xffffc400, v2 +; SI-NEXT: s_mov_b32 s4, 0xffff0000 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0xc4000000, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -1906,10 +1897,9 @@ ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4400, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4400, v2 +; SI-NEXT: s_mov_b32 s4, 0xffff0000 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44000000, v2 ; 
SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -1981,10 +1971,9 @@ ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4000, v2 +; SI-NEXT: s_mov_b32 s4, 0xffff0000 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, 2.0, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -2056,10 +2045,9 @@ ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffffc000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0xffffc000, v2 +; SI-NEXT: s_mov_b32 s4, 0xffff0000 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, -2.0, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm