diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2246,7 +2246,7 @@
let HasExtSDWA9 = 0;
}
-class VOP_PAT_GEN : VOPProfile {
+class VOP_PAT_GEN : VOPProfile {
let NeedPatGen = mode;
}
diff --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
--- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
@@ -40,8 +40,8 @@
; CI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, [[LHS]]
; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; GCN-LABEL: {{^}}v_ubfe_sub_i32:
; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
@@ -24,11 +24,8 @@
; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
-; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
-; SI-NEXT: v_lshr_b32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
-
-; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
-; VI-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
+; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
+; GCN-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
; GCN: [[BFE]]
; GCN: [[SHL]]
@@ -101,11 +98,8 @@
; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
-; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
-; SI-NEXT: v_ashr_i32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
-
-; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
-; VI-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
+; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
+; GCN-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
; GCN: [[BFE]]
; GCN: [[SHL]]
diff --git a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
--- a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
@@ -17,7 +17,7 @@
; SI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm
; SI-NEXT: v_and_b32_e32 v0, 7, v0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshr_b32_e32 v0, v2, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, v0, v2
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
b/llvm/test/CodeGen/AMDGPU/ctpop16.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -730,7 +730,7 @@ ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v14, s0, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v15, s0, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_bcnt_u32_b32_e64 v7, v7, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0 @@ -773,7 +773,6 @@ ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xffff ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -786,23 +785,23 @@ ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: s_mov_b32 s0, 0xffff ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; VI-NEXT: v_and_b32_e32 v3, v8, v3 -; VI-NEXT: v_and_b32_e32 v2, v8, v2 -; VI-NEXT: v_and_b32_e32 v1, v8, v1 -; VI-NEXT: v_and_b32_e32 v0, v8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; VI-NEXT: v_and_b32_e32 v3, s0, v3 +; VI-NEXT: v_and_b32_e32 v2, s0, v2 +; VI-NEXT: v_and_b32_e32 v1, s0, v1 +; VI-NEXT: v_and_b32_e32 v0, s0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; VI-NEXT: v_bcnt_u32_b32 v8, v8, 0 ; VI-NEXT: v_bcnt_u32_b32 v9, v9, 0 ; VI-NEXT: v_bcnt_u32_b32 v10, v10, 0 ; VI-NEXT: v_bcnt_u32_b32 v11, v11, 0 -; VI-NEXT: v_bcnt_u32_b32 v12, v12, 0 ; VI-NEXT: v_and_b32_e32 v7, s0, v7 ; VI-NEXT: v_and_b32_e32 v6, s0, v6 ; VI-NEXT: v_and_b32_e32 v5, s0, v5 @@ -811,27 +810,27 @@ ; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 ; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 -; VI-NEXT: v_bcnt_u32_b32 v8, v8, 0 +; VI-NEXT: v_bcnt_u32_b32 v12, v12, 0 ; VI-NEXT: v_bcnt_u32_b32 v13, v13, 0 ; VI-NEXT: v_bcnt_u32_b32 v14, v14, 0 ; VI-NEXT: v_bcnt_u32_b32 v15, v15, 0 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_bcnt_u32_b32 v7, v7, 0 ; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0 ; VI-NEXT: v_bcnt_u32_b32 v5, v5, 0 ; VI-NEXT: v_bcnt_u32_b32 v4, v4, 0 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_or_b32_e32 v3, v3, v9 -; VI-NEXT: v_or_b32_e32 v2, v2, v10 -; VI-NEXT: v_or_b32_e32 v1, v1, v11 -; VI-NEXT: v_or_b32_e32 v0, v0, v12 -; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_or_b32_e32 v3, v3, v8 +; VI-NEXT: v_or_b32_e32 v2, v2, v9 +; VI-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NEXT: v_or_b32_e32 v7, v7, v12 ; VI-NEXT: v_or_b32_e32 v6, v6, v13 ; VI-NEXT: v_or_b32_e32 v5, v5, v14 ; VI-NEXT: v_or_b32_e32 v4, v4, v15 diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll --- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll +++ 
b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll @@ -166,8 +166,8 @@ ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1 -; SI-NEXT: v_lshl_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshr_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bzhi32_d1_indexzext: diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -500,9 +500,9 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s32 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-NEXT: scratch_store_dword v2, v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 @@ -514,14 +514,14 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: v_mov_b32_e32 v2, s32 -; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 -; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 -; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: v_mov_b32_e32 v1, s32 +; GFX10-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1 +; GFX10-NEXT: scratch_store_dword v0, v3, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, v2, off glc dlc +; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -529,9 +529,9 @@ ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s32 -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 @@ -543,14 +543,14 @@ ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s32 -; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 -; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 -; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s32 +; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1 +; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off glc dlc +; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: @@ -1247,9 +1247,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 ; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi -; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 
+; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-NEXT: scratch_store_dword v2, v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 @@ -1261,17 +1261,17 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 -; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 -; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 -; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc +; GFX10-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1 +; GFX10-NEXT: scratch_load_dword v2, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: scratch_store_dword v0, v3, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, v2, off glc dlc +; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1282,9 +1282,9 @@ ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x100 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 @@ -1296,17 +1296,17 @@ ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 -; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo -; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 -; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 -; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc +; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1 +; GFX10-PAL-NEXT: scratch_load_dword v2, off, s32 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off +; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off glc dlc +; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: @@ -2019,9 +2019,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 ; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi -; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-NEXT: scratch_store_dword v2, v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 @@ -2033,17 +2033,17 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-NEXT: 
v_mov_b32_e32 v2, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 -; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 -; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc +; GFX10-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1 +; GFX10-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: scratch_store_dword v0, v3, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, v2, off glc dlc +; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2054,9 +2054,9 @@ ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4004 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 @@ -2068,17 +2068,17 @@ ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo -; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 -; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 -; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc +; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo +; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1 +; GFX10-PAL-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off +; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off glc dlc +; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -339,57 +339,56 @@ ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX7-NEXT: buffer_load_ushort v16, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4 +; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v1, v3, 0, 4 -; GFX7-NEXT: v_bfe_i32 v4, v3, 4, 4 -; GFX7-NEXT: v_bfe_i32 v5, v3, 8, 4 -; GFX7-NEXT: v_bfe_i32 v6, v3, 12, 4 -; GFX7-NEXT: v_bfe_i32 v7, v3, 16, 4 -; GFX7-NEXT: v_bfe_i32 v8, 
v3, 20, 4 -; GFX7-NEXT: v_bfe_i32 v9, v3, 24, 4 -; GFX7-NEXT: v_ashrrev_i32_e32 v3, 28, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 -; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 -; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 -; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 -; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 -; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 -; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v2, v9 -; GFX7-NEXT: v_and_b32_e32 v3, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX7-NEXT: v_and_b32_e32 v16, v2, v16 -; GFX7-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 +; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4 +; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v9, v16 +; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4 ; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 +; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 ; GFX7-NEXT: v_and_b32_e32 v12, s4, v12 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 +; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4 ; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4 ; GFX7-NEXT: v_and_b32_e32 v13, s4, v13 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 +; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4 ; GFX7-NEXT: v_and_b32_e32 v14, s4, v14 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v10, v2 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 +; GFX7-NEXT: v_and_b32_e32 v15, s4, v15 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 -; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -913,57 +912,56 @@ ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: v_mov_b32_e32 v2, 0xff +; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4 +; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v1, v3, 0, 4 -; GFX7-NEXT: v_bfe_i32 v4, v3, 4, 4 -; GFX7-NEXT: v_bfe_i32 v5, v3, 8, 4 -; GFX7-NEXT: v_bfe_i32 v6, v3, 12, 4 -; GFX7-NEXT: v_bfe_i32 v7, v3, 16, 4 -; GFX7-NEXT: v_bfe_i32 v8, v3, 20, 4 -; GFX7-NEXT: v_bfe_i32 v9, v3, 24, 4 -; 
GFX7-NEXT: v_ashrrev_i32_e32 v3, 28, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 -; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 -; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 -; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 -; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 -; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 -; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v2, v9 -; GFX7-NEXT: v_and_b32_e32 v3, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX7-NEXT: v_and_b32_e32 v16, v2, v16 -; GFX7-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX7-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 +; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4 +; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v9, v16 +; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4 ; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 +; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 ; GFX7-NEXT: v_and_b32_e32 v12, s4, v12 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 +; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4 ; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4 ; GFX7-NEXT: v_and_b32_e32 v13, s4, v13 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 +; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4 ; GFX7-NEXT: v_and_b32_e32 v14, s4, v14 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v10, v2 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 +; GFX7-NEXT: v_and_b32_e32 v15, s4, v15 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 -; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2201,69 +2199,68 @@ ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_ushort v16, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v1, v3, 20, 4 -; GFX7-NEXT: v_bfe_i32 v4, v3, 16, 4 -; GFX7-NEXT: v_bfe_i32 v5, v3, 4, 4 -; GFX7-NEXT: v_bfe_i32 v6, v3, 0, 4 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_bfe_i32 v1, v2, 20, 4 +; GFX7-NEXT: v_bfe_i32 v3, v2, 16, 4 +; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 +; GFX7-NEXT: v_bfe_i32 v5, v2, 0, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; 
GFX7-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_i32 v9, v0, 20, 4 +; GFX7-NEXT: v_bfe_i32 v10, v0, 16, 4 +; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 0, 4 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; GFX7-NEXT: v_and_b32_e32 v5, s4, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; GFX7-NEXT: v_and_b32_e32 v10, s4, v12 +; GFX7-NEXT: v_bfe_i32 v13, v0, 24, 4 +; GFX7-NEXT: v_ashrrev_i32_e32 v15, 28, v0 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v10, v9 +; GFX7-NEXT: v_and_b32_e32 v11, s4, v13 +; GFX7-NEXT: v_and_b32_e32 v13, s4, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4 +; GFX7-NEXT: v_bfe_i32 v14, v0, 8, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_i32 v10, v0, 20, 4 -; GFX7-NEXT: v_bfe_i32 v11, v0, 16, 4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 4, 4 -; GFX7-NEXT: v_bfe_i32 v13, v0, 0, 4 -; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; GFX7-NEXT: v_and_b32_e32 v11, v2, v13 -; GFX7-NEXT: v_bfe_i32 v7, v3, 24, 4 -; GFX7-NEXT: v_bfe_i32 v8, v3, 8, 4 -; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v3 -; GFX7-NEXT: v_bfe_i32 v3, v3, 12, 4 -; GFX7-NEXT: v_bfe_i32 v14, v0, 24, 4 -; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4 -; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0 +; GFX7-NEXT: v_mad_u32_u24 v3, v3, v5, v16 +; GFX7-NEXT: v_bfe_i32 v6, v2, 24, 4 +; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v2 +; GFX7-NEXT: v_bfe_i32 v2, v2, 12, 4 +; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 ; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX7-NEXT: v_or_b32_e32 v6, v11, v10 -; GFX7-NEXT: v_and_b32_e32 v3, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v9, v2, v9 -; GFX7-NEXT: v_and_b32_e32 v12, v2, v14 -; GFX7-NEXT: v_and_b32_e32 v13, v2, v15 -; GFX7-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_and_b32_e32 v14, v2, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v6, v2, v6 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v5 -; GFX7-NEXT: buffer_load_ushort v5, off, s[0:3], 0 +; GFX7-NEXT: v_and_b32_e32 v12, s4, v14 +; GFX7-NEXT: v_mad_u32_u24 v3, v15, v10, v3 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_mad_u32_u24 v3, v7, v12, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v4, v0 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_mad_u32_u24 v0, v14, v9, v0 ; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v4, v4, v6, v5 -; GFX7-NEXT: v_mad_u32_u24 v4, v16, v11, v4 -; 
GFX7-NEXT: v_mad_u32_u24 v4, v8, v13, v4 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v4 -; GFX7-NEXT: v_mad_u32_u24 v0, v1, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v15, v10, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v7, v12, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v14, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v6, v11, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v8, v13, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2813,95 +2810,93 @@ ; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: v_mov_b32_e32 v2, 0xff ; GFX7-NEXT: s_mov_b32 s5, 0xffff -; GFX7-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v6, v4, 20, 4 -; GFX7-NEXT: v_bfe_i32 v7, v4, 16, 4 -; GFX7-NEXT: v_bfe_i32 v8, v4, 12, 4 -; GFX7-NEXT: v_bfe_i32 v9, v4, 8, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v2 +; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_i32 v13, v0, 24, 4 -; GFX7-NEXT: v_bfe_i32 v16, v0, 12, 4 -; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX7-NEXT: v_or_b32_e32 v7, v9, v8 -; GFX7-NEXT: v_and_b32_e32 v9, v2, v13 +; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 +; GFX7-NEXT: v_bfe_i32 v16, v0, 4, 4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v9 +; GFX7-NEXT: v_and_b32_e32 v9, s4, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v16 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v4 -; GFX7-NEXT: v_bfe_i32 v5, v4, 24, 4 -; GFX7-NEXT: v_bfe_i32 v10, v4, 4, 4 -; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v1 +; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4 +; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4 +; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 +; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4 +; GFX7-NEXT: v_bfe_i32 v8, v2, 4, 4 +; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_and_b32_e32 v4, v2, v4 -; GFX7-NEXT: v_ashrrev_i32_e32 v12, 28, v0 -; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4 -; GFX7-NEXT: v_bfe_i32 v15, v0, 16, 4 -; GFX7-NEXT: v_bfe_i32 v17, v0, 8, 4 -; GFX7-NEXT: v_bfe_i32 v18, v0, 4, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_ashrrev_i32_e32 v10, 28, v0 +; GFX7-NEXT: v_bfe_i32 v11, v0, 24, 4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 20, 4 +; GFX7-NEXT: v_bfe_i32 v14, v0, 12, 4 +; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v11 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v10 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v10 +; GFX7-NEXT: v_and_b32_e32 v7, s4, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v14 -; GFX7-NEXT: v_and_b32_e32 v11, v2, 
v15 -; GFX7-NEXT: v_and_b32_e32 v14, v2, v17 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v18 -; GFX7-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v6, s5, v6 -; GFX7-NEXT: v_or_b32_e32 v8, v9, v8 -; GFX7-NEXT: v_or_b32_e32 v9, v11, v10 -; GFX7-NEXT: v_or_b32_e32 v10, v14, v13 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v14 +; GFX7-NEXT: v_and_b32_e32 v12, s4, v15 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v4, s5, v4 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX7-NEXT: v_or_b32_e32 v7, v9, v8 +; GFX7-NEXT: v_or_b32_e32 v8, v12, v11 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v2, s5, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX7-NEXT: v_and_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v7 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v9 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v6 -; GFX7-NEXT: v_and_b32_e32 v7, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v13, v2, v0 -; GFX7-NEXT: v_bfe_u32 v8, v4, 8, 8 -; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v4 -; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v12, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v5, s5, v7 +; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 8 +; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 8 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_and_b32_e32 v12, v2, v12 -; GFX7-NEXT: v_and_b32_e32 v9, v2, v5 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_bfe_u32 v10, v5, 8, 8 -; GFX7-NEXT: v_bfe_u32 v15, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v14, s4, v4 +; GFX7-NEXT: v_bfe_u32 v9, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v15, v4, 8, 8 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v7, v7, v13, v16 -; GFX7-NEXT: v_mad_u32_u24 v7, v8, v14, v7 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v0, v7 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v11, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v10, v15, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v3, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v1, v12, v0 +; GFX7-NEXT: v_mad_u32_u24 v6, v6, v12, v16 +; GFX7-NEXT: v_mad_u32_u24 v6, v7, v13, v6 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v6 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v11, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v8, v14, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v15, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v3, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ 
-2481,55 +2481,54 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xf00 -; GFX7-NEXT: v_mov_b32_e32 v3, 0xf00 ; GFX7-NEXT: s_movk_i32 s5, 0xf0f ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 28, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 12, v2 ; GFX7-NEXT: v_bfe_u32 v1, v2, 8, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 4, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 15, v2 -; GFX7-NEXT: v_bfe_u32 v7, v2, 16, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 12, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 28, v2 +; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 4, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 4, v0 -; GFX7-NEXT: v_alignbit_b32 v2, v6, v2, 24 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v0 -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_and_b32_e32 v6, v3, v9 -; GFX7-NEXT: v_and_b32_e32 v3, v3, v11 -; GFX7-NEXT: v_and_b32_e32 v12, 15, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 28, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 28, v0 +; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 15, v2 +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v11, 15, v0 +; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 12, v0 +; GFX7-NEXT: v_alignbit_b32 v2, v5, v2, 24 +; GFX7-NEXT: v_and_b32_e32 v5, s4, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 4, v0 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_alignbit_b32 v0, v12, v0, 24 +; GFX7-NEXT: v_and_b32_e32 v7, s4, v10 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v14 +; GFX7-NEXT: v_and_b32_e32 v5, s4, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_bfe_u32 v14, v0, 16, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 12, v0 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_or_b32_e32 v7, v9, v7 ; GFX7-NEXT: v_and_b32_e32 v2, s5, v2 -; GFX7-NEXT: v_alignbit_b32 v0, v13, v0, 24 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v15 +; GFX7-NEXT: v_or_b32_e32 v3, v13, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_and_b32_e32 v6, 15, v1 ; GFX7-NEXT: v_and_b32_e32 v12, 15, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v14, v4 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 4 ; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v6, v6, v12, v16 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 4 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v3 
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll --- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK --check-prefix=PRE-GFX8 %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK --check-prefix=GFX8 %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s ; CHECK-LABEL: {{^}}inline_asm: ; CHECK: s_endpgm @@ -260,8 +260,7 @@ ; CHECK: ; def v0 ; CHECK: v_mov_b32_e32 v1, v0 ; CHECK: ; def v0 -; PRE-GFX8: v_lshl_b32_e32 v{{[0-9]+}}, v1, v0 -; GFX8: v_lshlrev_b32_e32 v{{[0-9]+}}, v0, v1 +; CHECK: v_lshlrev_b32_e32 v{{[0-9]+}}, v0, v1 define amdgpu_kernel void @muliple_def_phys_vgpr() { entry: %def0 = call i32 asm sideeffect "; def $0 ", "={v0}"() diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -711,7 +711,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, 0x3e70000 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; @@ -769,7 +769,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, 0xfff10000 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; @@ -945,7 +945,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, 0x45000000 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; @@ -1003,7 +1003,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, 0x230000 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; @@ -1343,7 +1343,7 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1473,7 +1473,7 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -161,9 +161,8 @@ ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 ; SI-NEXT: s_mov_b64 s[2:3], exec ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 1, v0 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; SI-NEXT: s_xor_b64 s[0:1], exec, s[4:5] @@ -189,9 +188,8 @@ ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[4:5] @@ -217,9 +215,8 @@ ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX10-32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-32-NEXT: v_and_b32_e32 v1, 1, v0 ; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s2, s0 ; GFX10-32-NEXT: s_xor_b32 s0, exec_lo, s2 @@ -245,9 +242,8 @@ ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX10-64-NEXT: s_mov_b64 s[2:3], exec ; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-64-NEXT: v_and_b32_e32 v1, 1, v0 ; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: s_xor_b64 s[0:1], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -123,8 +123,8 @@ ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; CI-NEXT: v_lshr_b32_e32 v2, v2, v3 -; CI-NEXT: v_lshr_b32_e32 v3, v4, v5 +; CI-NEXT: v_lshrrev_b32_e32 v2, v3, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -512,10 +512,10 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; CI-NEXT: v_lshr_b32_e32 v3, v3, v5 -; CI-NEXT: v_lshr_b32_e32 v5, v7, v9 -; CI-NEXT: v_lshr_b32_e32 v2, v2, v4 -; CI-NEXT: v_lshr_b32_e32 v4, v6, v8 +; CI-NEXT: v_lshrrev_b32_e32 v3, v5, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, v9, v7 +; CI-NEXT: v_lshrrev_b32_e32 v2, v4, v2 +; CI-NEXT: v_lshrrev_b32_e32 v4, v8, v6 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 v3, v3, v5 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -573,8 +573,7 @@ ; ; 
TODO: Why is the constant not peepholed into the v_or_b32_e32? ; -; NOSDWA: s_mov_b32 [[CONST:s[0-9]+]], 0x10000 -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, s0, +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, 0x10000, ; SDWA: v_or_b32_e32 v{{[0-9]+}}, 0x10000, define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-xor.ll b/llvm/test/CodeGen/AMDGPU/select-constant-xor.ll --- a/llvm/test/CodeGen/AMDGPU/select-constant-xor.ll +++ b/llvm/test/CodeGen/AMDGPU/select-constant-xor.ll @@ -105,7 +105,7 @@ ; CHECK-NEXT: v_bfe_i32 v0, v0, 0, 8 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x54 ; CHECK-NEXT: v_ashrrev_i16 v0, 7, v0 -; CHECK-NEXT: v_xor_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; CHECK-NEXT: v_xor_b32_sdwa v0, sext(v0), v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; CHECK-NEXT: s_setpc_b64 s[30:31] %c = icmp sgt i8 %a, -1 %s = select i1 %c, i32 84, i32 -85 diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -577,7 +577,7 @@ ; GCN: {{buffer|flat|global}}_load_ushort [[VAL0:v[0-9]+]] ; GCN: {{buffer|flat|global}}_load_ushort [[VAL1:v[0-9]+]] -; SI: v_lshl_b32_e32 [[REG:v[0-9]+]], [[VAL0]], [[VAL1]] +; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]] ; GFX89: v_lshlrev_b16_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]] ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -397,7 +397,7 @@ ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_lshl_b32_e32 v0, v2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, v0, v2 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -558,8 +558,8 @@ ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; SI-NEXT: v_lshl_b32_e32 v0, v2, v0 -; SI-NEXT: v_lshl_b32_e32 v1, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, v3, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -655,10 +655,10 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; SI-NEXT: v_lshl_b32_e32 v3, v3, v5 -; SI-NEXT: v_lshl_b32_e32 v2, v2, v4 -; SI-NEXT: v_lshl_b32_e32 v4, v7, v9 -; SI-NEXT: v_lshl_b32_e32 v5, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, v9, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, v8, v6 ; SI-NEXT: v_and_b32_e32 v3, s0, v3 ; SI-NEXT: v_and_b32_e32 v2, s0, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -126,8 +126,8 @@ ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; CI-NEXT: v_lshl_b32_e32 v2, v2, v3 -; CI-NEXT: v_lshl_b32_e32 v3, v4, v5 +; CI-NEXT: v_lshlrev_b32_e32 v2, v3, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, v5, v4 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -516,10 +516,10 @@ ; CI-NEXT: s_waitcnt 
vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; CI-NEXT: v_lshl_b32_e32 v3, v3, v5 -; CI-NEXT: v_lshl_b32_e32 v2, v2, v4 -; CI-NEXT: v_lshl_b32_e32 v4, v7, v9 -; CI-NEXT: v_lshl_b32_e32 v5, v6, v8 +; CI-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; CI-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; CI-NEXT: v_lshlrev_b32_e32 v4, v9, v7 +; CI-NEXT: v_lshlrev_b32_e32 v5, v8, v6 ; CI-NEXT: v_and_b32_e32 v3, s0, v3 ; CI-NEXT: v_and_b32_e32 v2, s0, v2 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -525,7 +525,7 @@ ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -559,7 +559,7 @@ ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -613,39 +613,38 @@ ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v11 -; GFX6-NEXT: v_bfrev_b32_e32 v16, 1 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v3, v16, v3 +; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v4, v12 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v4, v16, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, s6, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v13 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v5, v16, v5 +; GFX6-NEXT: v_xor_b32_e32 v5, s6, v5 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v6, v14 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v6, v16, v6 +; GFX6-NEXT: v_xor_b32_e32 v6, s6, v6 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v15 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v7, v16, v7 +; GFX6-NEXT: v_xor_b32_e32 v7, s6, v7 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -676,39 +675,38 @@ ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v3, v11 -; GFX8-NEXT: v_bfrev_b32_e32 v16, 1 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cmp_lt_i32_e64 
s[4:5], v8, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v3, v16, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v4, v12 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v4, v16, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v5, v13 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v5, v16, v5 +; GFX8-NEXT: v_xor_b32_e32 v5, s6, v5 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v6, v14 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v6, v16, v6 +; GFX8-NEXT: v_xor_b32_e32 v6, s6, v6 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v7, v15 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v7, v16, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -772,93 +770,92 @@ ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v3, v19 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v17, 1 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v3, v17, v3 +; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v4, v17, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, s6, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v5, v21 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v5, v17, v5 +; GFX6-NEXT: v_xor_b32_e32 v5, s6, v5 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v6, v22 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v6, v17, v6 +; GFX6-NEXT: v_xor_b32_e32 v6, s6, v6 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v7, v23 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v7, v17, v7 +; GFX6-NEXT: v_xor_b32_e32 v7, s6, v7 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v8, v24 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 ; 
GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v8, v17, v8
+; GFX6-NEXT: v_xor_b32_e32 v8, s6, v8
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc
; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v9, v25
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9
; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v9, v17, v9
+; GFX6-NEXT: v_xor_b32_e32 v9, s6, v9
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc
; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v10, v26
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10
; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v10, v17, v10
+; GFX6-NEXT: v_xor_b32_e32 v10, s6, v10
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc
; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v11, v27
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11
; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v11, v17, v11
+; GFX6-NEXT: v_xor_b32_e32 v11, s6, v11
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc
; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v12, v28
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v12
; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v12, v17, v12
+; GFX6-NEXT: v_xor_b32_e32 v12, s6, v12
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc
; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v13, v29
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13
; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v13, v17, v13
+; GFX6-NEXT: v_xor_b32_e32 v13, s6, v13
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc
; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v14, v30
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14
; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v14, v17, v14
+; GFX6-NEXT: v_xor_b32_e32 v14, s6, v14
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc
; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v31
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v31
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15
; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v15, v17, v15
+; GFX6-NEXT: v_xor_b32_e32 v15, s6, v15
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -891,93 +888,92 @@
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v3, v19
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3
-; GFX8-NEXT: v_bfrev_b32_e32 v17, 1
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v3, v17, v3
+; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4
; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v4, v17, v4
+; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v5, v21
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5
; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v5, v17, v5
+; GFX8-NEXT: v_xor_b32_e32 v5, s6, v5
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v6, v22
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6
; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v6, v17, v6
+; GFX8-NEXT: v_xor_b32_e32 v6, s6, v6
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v7, v23
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7
; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v7, v17, v7
+; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v8, v24
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8
; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v8, v17, v8
+; GFX8-NEXT: v_xor_b32_e32 v8, s6, v8
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v9, v25
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9
; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v9, v17, v9
+; GFX8-NEXT: v_xor_b32_e32 v9, s6, v9
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v10, v26
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10
; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v10, v17, v10
+; GFX8-NEXT: v_xor_b32_e32 v10, s6, v10
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v11, v27
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11
; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v11, v17, v11
+; GFX8-NEXT: v_xor_b32_e32 v11, s6, v11
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v12, v28
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v12
; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v12, v17, v12
+; GFX8-NEXT: v_xor_b32_e32 v12, s6, v12
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v13, v29
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13
; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v13, v17, v13
+; GFX8-NEXT: v_xor_b32_e32 v13, s6, v13
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v14, v30
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14
; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v14, v17, v14
+; GFX8-NEXT: v_xor_b32_e32 v14, s6, v14
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v31
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v31
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15
; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v15, v17, v15
+; GFX8-NEXT: v_xor_b32_e32 v15, s6, v15
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -37,8 +37,8 @@
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
-; GCN: s_addk_i32 s32, 0x2800{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
+; GCN: s_addk_i32 s32, 0x2800{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
@@ -58,8 +58,8 @@
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
-; GCN: s_addk_i32 s32, 0x3000{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
+; GCN: s_addk_i32 s32, 0x3000{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -49,8 +49,7 @@
; GFX9-O0-NEXT: s_mov_b32 s35, 1
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s35, v3
; GFX9-O0-NEXT: s_mov_b32 s35, 2
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, s35
-; GFX9-O0-NEXT: v_and_b32_e32 v3, v3, v4
+; GFX9-O0-NEXT: v_and_b32_e64 v3, v3, s35
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[36:39], s34 offset:4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
@@ -217,8 +216,7 @@
; GFX9-O0-NEXT: s_mov_b32 s34, 1
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0
; GFX9-O0-NEXT: s_mov_b32 s34, 2
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34
-; GFX9-O0-NEXT: v_and_b32_e32 v0, v0, v3
+; GFX9-O0-NEXT: v_and_b32_e64 v0, v0, s34
; GFX9-O0-NEXT: s_mov_b32 s34, 0
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1