diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2784,20 +2784,21 @@
   (S_MOV_B32 SReg_32:$src)
 >;
 
-multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
+multiclass BFMPatterns <ValueType vt, PatFrag SHL, PatFrag ADD, InstSI BFM> {
   def : GCNPat <
-    (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
+    (vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
     (BFM $a, $b)
   >;
 
   def : GCNPat <
-    (vt (add (vt (shl 1, vt:$a)), -1)),
-    (BFM $a, (MOV (i32 0)))
+    (vt (ADD (vt (shl 1, vt:$a)), -1)),
+    (BFM $a, (i32 0))
   >;
 }
 
-defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
-// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
+defm : BFMPatterns <i32, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B32>;
+// FIXME: defm : BFMPatterns <i64, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B64>;
+defm : BFMPatterns <i32, DivergentBinFrag<shl>, DivergentBinFrag<add>, V_BFM_B32_e64>;
 
 // Bitfield extract patterns
 
diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll
--- a/llvm/test/CodeGen/AMDGPU/bfm.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfm.ll
@@ -1,10 +1,31 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
 
-; FUNC-LABEL: {{^}}bfm_pattern:
-; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-define amdgpu_kernel void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @s_bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+; SI-LABEL: s_bfm_pattern:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bfm_b32 s4, s4, s5
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: s_bfm_pattern:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_bfm_b32 s2, s2, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
   %a = shl i32 1, %x
   %b = sub i32 %a, 1
   %c = shl i32 %b, %y
@@ -12,9 +33,83 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}bfm_pattern_simple:
-; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0
-define amdgpu_kernel void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @s_bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
+; SI-LABEL: s_bfm_pattern_simple:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bfm_b32 s4, s2, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: s_bfm_pattern_simple:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_bfm_b32 s2, s2, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+  %a = shl i32 1, %x
+  %b = sub i32 %a, 1
+  store i32 %b, i32 addrspace(1)* %out
+  ret void
+}
+
+define void @v_bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+; SI-LABEL: v_bfm_pattern:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    v_bfm_b32_e32 v2, v2, v3
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s6
+; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bfm_pattern:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_bfm_b32 v2, v2, v3
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %a = shl i32 1, %x
+  %b = sub i32 %a, 1
+  %c = shl i32 %b, %y
+  store i32 %c, i32 addrspace(1)* %out
+  ret void
+}
+
+define void @v_bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
+; SI-LABEL: v_bfm_pattern_simple:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    v_bfm_b32_e64 v2, v2, 0
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s6
+; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bfm_pattern_simple:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_bfm_b32 v2, v2, 0
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
   %a = shl i32 1, %x
   %b = sub i32 %a, 1
   store i32 %b, i32 addrspace(1)* %out