diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1887,6 +1887,10 @@ case AMDGPU::V_SET_INACTIVE_B32: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + // FIXME: We may possibly optimize the COPY once we find ways to make LLVM + // optimizations (mainly Register Coalescer) aware of WWM register liveness. + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) + .add(MI.getOperand(1)); auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) @@ -1899,11 +1903,15 @@ case AMDGPU::V_SET_INACTIVE_B64: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); - FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), MI.getOperand(0).getReg()) - .add(MI.getOperand(2)); + .add(MI.getOperand(1)); + expandPostRAPseudo(*Copy); + auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); + FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten + Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), + MI.getOperand(0).getReg()) + .add(MI.getOperand(2)); expandPostRAPseudo(*Copy); BuildMI(MBB, MI, DL, get(NotOpc), Exec) .addReg(Exec); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -180,15 +180,13 @@ // restoring it after we're done. let Defs = [SCC] in { def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst), - (ins VGPR_32: $src, VSrc_b32:$inactive), + (ins VSrc_b32: $src, VSrc_b32:$inactive), [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> { - let Constraints = "$src = $vdst"; } def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst), - (ins VReg_64: $src, VSrc_b64:$inactive), + (ins VSrc_b64: $src, VSrc_b64:$inactive), [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> { - let Constraints = "$src = $vdst"; } } // End Defs = [SCC] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -44,14 +44,14 @@ ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s3, s[4:7], 0x0 +; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s3, 56 +; GCN-NEXT: s_cmp_lg_u32 s2, 56 ; GCN-NEXT: s_cselect_b32 s2, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -363,12 +363,12 @@ ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -388,7 +388,7 @@ ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB2_2 @@ -417,12 +417,12 @@ ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -442,7 +442,7 @@ ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 @@ -1448,12 +1448,12 @@ ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -1473,7 +1473,7 @@ ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB8_2 @@ -1502,12 +1502,12 @@ ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -1527,7 +1527,7 @@ ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_2 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -360,12 +360,12 @@ ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -385,7 +385,7 @@ ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB2_2 @@ -411,12 +411,12 @@ ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -436,7 +436,7 @@ ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 @@ -587,9 +587,9 @@ ; ; GFX8-LABEL: add_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -608,7 +608,7 @@ ; GFX8-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8-NEXT: s_mov_b64 exec, s[0:1] ; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB3_2 ; GFX8-NEXT: ; %bb.1: @@ -623,9 +623,9 @@ ; ; GFX9-LABEL: add_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -644,7 +644,7 @@ ; GFX9-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB3_2 ; GFX9-NEXT: ; %bb.1: @@ -1525,12 +1525,12 @@ ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -1550,7 +1550,7 @@ ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB9_2 @@ -1576,12 +1576,12 @@ ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -1601,7 +1601,7 @@ ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_2 @@ -1752,9 +1752,9 @@ ; ; GFX8-LABEL: sub_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -1773,7 +1773,7 @@ ; GFX8-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8-NEXT: s_mov_b64 exec, s[0:1] ; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB10_2 ; GFX8-NEXT: ; %bb.1: @@ -1788,9 +1788,9 @@ ; ; GFX9-LABEL: sub_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -1809,7 +1809,7 @@ ; GFX9-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: @@ -2364,12 +2364,12 @@ ; GFX8-LABEL: and_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, -1 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, -1 ; GFX8-NEXT: s_not_b64 exec, exec @@ -2389,7 +2389,7 @@ ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB14_2 @@ -2415,12 +2415,12 @@ ; GFX9-LABEL: and_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, -1 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, -1 ; GFX9-NEXT: s_not_b64 exec, exec @@ -2440,7 +2440,7 @@ ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB14_2 @@ -2598,12 +2598,12 @@ ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -2623,7 +2623,7 @@ ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB15_2 @@ -2649,12 +2649,12 @@ ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -2674,7 +2674,7 @@ ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB15_2 @@ -2832,12 +2832,12 @@ ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -2857,7 +2857,7 @@ ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB16_2 @@ -2883,12 +2883,12 @@ ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -2908,7 +2908,7 @@ ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_2 @@ -3066,12 +3066,12 @@ ; GFX8-LABEL: max_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, v1 ; GFX8-NEXT: s_not_b64 exec, exec @@ -3091,7 +3091,7 @@ ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB17_2 @@ -3117,12 +3117,12 @@ ; GFX9-LABEL: max_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: s_not_b64 exec, exec @@ -3142,7 +3142,7 @@ ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB17_2 @@ -3166,10 +3166,10 @@ ; ; GFX1064-LABEL: max_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064-NEXT: s_not_b64 exec, exec @@ -3229,10 +3229,10 @@ ; ; GFX1032-LABEL: max_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo @@ -3483,12 +3483,12 @@ ; GFX8-LABEL: min_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, v1 ; GFX8-NEXT: s_not_b64 exec, exec @@ -3508,7 +3508,7 @@ ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB19_2 @@ -3534,12 +3534,12 @@ ; GFX9-LABEL: min_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: s_not_b64 exec, exec @@ -3559,7 +3559,7 @@ ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB19_2 @@ -3583,10 +3583,10 @@ ; ; GFX1064-LABEL: min_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064-NEXT: s_not_b64 exec, exec @@ -3646,10 +3646,10 @@ ; ; GFX1032-LABEL: min_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo @@ -3900,12 +3900,12 @@ ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -3925,7 +3925,7 @@ ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB21_2 @@ -3951,12 +3951,12 @@ ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -3976,7 +3976,7 @@ ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB21_2 @@ -4310,12 +4310,12 @@ ; GFX8-LABEL: umin_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, -1 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, -1 ; GFX8-NEXT: s_not_b64 exec, exec @@ -4335,7 +4335,7 @@ ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB23_2 @@ -4361,12 +4361,12 @@ ; GFX9-LABEL: umin_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, -1 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, -1 ; GFX9-NEXT: s_not_b64 exec, exec @@ -4386,7 +4386,7 @@ ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB23_2 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -179,16 +179,16 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 +; GFX8-NEXT: ; implicit-def: $vgpr3 ; GFX8-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX8-NEXT: s_cbranch_execz .LBB1_4 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[10:11] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -208,7 +208,7 @@ ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[10:11] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_3 @@ -220,14 +220,14 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v0 ; GFX8-NEXT: .LBB1_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_wqm_b64 s[4:5], -1 ; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX8-NEXT: s_cbranch_vccnz .LBB1_6 ; GFX8-NEXT: ; %bb.5: ; %if -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GFX8-NEXT: .LBB1_6: ; %UnifiedReturnBlock ; GFX8-NEXT: s_endpgm ; @@ -235,16 +235,16 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX9-NEXT: s_cbranch_execz .LBB1_4 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[10:11] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -264,7 +264,7 @@ ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[10:11] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_3 @@ -276,26 +276,26 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v3, s4, v0 ; GFX9-NEXT: .LBB1_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_wqm_b64 s[4:5], -1 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB1_6 ; GFX9-NEXT: ; %bb.5: ; %if -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GFX9-NEXT: .LBB1_6: ; %UnifiedReturnBlock ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr4 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX1064-NEXT: s_cbranch_execz .LBB1_4 ; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec @@ -339,26 +339,26 @@ ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX1064-NEXT: v_add_nc_u32_e32 v4, s4, v0 ; GFX1064-NEXT: .LBB1_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1 ; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX1064-NEXT: s_cbranch_vccnz .LBB1_6 ; GFX1064-NEXT: ; %bb.5: ; %if -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[0:3], 0 ; GFX1064-NEXT: .LBB1_6: ; %UnifiedReturnBlock ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: ; implicit-def: $vgpr4 ; GFX1032-NEXT: s_mov_b32 s9, s8 -; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s8, s9 ; GFX1032-NEXT: s_cbranch_execz .LBB1_4 ; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo @@ -393,14 +393,14 @@ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX1032-NEXT: v_add_nc_u32_e32 v4, s4, v0 ; GFX1032-NEXT: .LBB1_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1032-NEXT: s_wqm_b32 s4, -1 ; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_vccnz .LBB1_6 ; GFX1032-NEXT: ; %bb.5: ; %if -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[0:3], 0 ; GFX1032-NEXT: .LBB1_6: ; %UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -28,9 +28,9 @@ ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -5,16 +5,17 @@ ; GCN-LABEL: if_then: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: ; %bb.1: ; %.bb0 -; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: v_mov_b32_e32 v3, 1 ; GCN-NEXT: ; %bb.2: ; %.merge ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v0 ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.3: ; %.then +; GCN-NEXT: v_mov_b32_e32 v1, v3 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo @@ -23,14 +24,14 @@ ; GCN-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s1 ; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v3, -1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: buffer_store_dword v3, v0, s[4:7], 0 offen +; GCN-NEXT: v_mov_b32_e32 v4, -1 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: buffer_store_dword v4, v0, s[4:7], 0 offen ; GCN-NEXT: .LBB0_4: ; %.end ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: v_mov_b32_e32 v0, -1 -; GCN-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GCN-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen ; GCN-NEXT: s_endpgm .entry: %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0 @@ -63,10 +64,10 @@ ; GCN-LABEL: if_else_vgpr_opt: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: ; %bb.1: ; %.bb0 -; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v3, 1 ; GCN-NEXT: ; %bb.2: ; %.merge ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v0 @@ -77,6 +78,7 @@ ; GCN-NEXT: s_or_saveexec_b32 s1, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 exec_lo, s1 +; GCN-NEXT: v_mov_b32_e32 v2, v3 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo @@ -85,8 +87,8 @@ ; GCN-NEXT: s_mov_b32 exec_lo, s1 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: v_mov_b32_e32 v3, -1 -; GCN-NEXT: ; implicit-def: $vgpr2 ; GCN-NEXT: buffer_store_dword v3, v0, s[4:7], 0 offen +; GCN-NEXT: ; implicit-def: $vgpr3 ; GCN-NEXT: .LBB1_4: ; %Flow ; GCN-NEXT: s_or_saveexec_b32 s0, s0 ; GCN-NEXT: s_waitcnt_depctr 0xffe3 @@ -94,7 +96,7 @@ ; GCN-NEXT: s_cbranch_execz .LBB1_6 ; GCN-NEXT: ; %bb.5: ; %.then ; GCN-NEXT: v_mov_b32_e32 v0, -1 -; GCN-NEXT: buffer_store_dword v0, v2, s[4:7], 0 offen +; GCN-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen ; GCN-NEXT: .LBB1_6: ; %.end ; GCN-NEXT: s_endpgm .entry: diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -691,9 +691,10 @@ ; GFX9-W64-LABEL: test_wwm_set_inactive1: ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_not_b64 exec, exec +; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -706,9 +707,10 @@ ; GFX10-W32-LABEL: test_wwm_set_inactive1: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 @@ -2315,7 +2317,8 @@ ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v0 +; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-W64-NEXT: s_not_b64 exec, exec @@ -2344,7 +2347,8 @@ ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v0 +; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo @@ -2795,9 +2799,10 @@ ; GFX9-W64-LABEL: test_strict_wwm_set_inactive1: ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_not_b64 exec, exec +; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -2810,9 +2815,10 @@ ; GFX10-W32-LABEL: test_strict_wwm_set_inactive1: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 @@ -2848,7 +2854,8 @@ ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v0 +; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-W64-NEXT: s_not_b64 exec, exec @@ -2877,7 +2884,8 @@ ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v0 +; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -66,18 +66,19 @@ ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; GFX9-O3-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec @@ -97,9 +98,7 @@ ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) @@ -233,50 +232,48 @@ ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc +; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-O3-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-O3-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: ; %bb.2: ; %merge +; GFX9-O3-NEXT: .LBB1_2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0 ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] @@ -355,8 +352,7 @@ ; GFX9-O0-NEXT: s_mov_b32 s39, s7 ; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_not_b64 exec, exec @@ -534,7 +530,7 @@ ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00 ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 -; GFX9-O0-NEXT: s_mov_b32 s34, s8 +; GFX9-O0-NEXT: s_mov_b32 s30, s8 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 @@ -544,18 +540,16 @@ ; GFX9-O0-NEXT: v_writelane_b32 v10, s37, 3 ; GFX9-O0-NEXT: v_writelane_b32 v10, s38, 4 ; GFX9-O0-NEXT: v_writelane_b32 v10, s39, 5 -; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 -; GFX9-O0-NEXT: s_mov_b32 s35, s9 -; GFX9-O0-NEXT: ; kill: def $sgpr30_sgpr31 killed $sgpr34_sgpr35 -; GFX9-O0-NEXT: s_mov_b64 s[30:31], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s35 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: ; kill: def $sgpr30 killed $sgpr30 def $sgpr30_sgpr31 +; GFX9-O0-NEXT: s_mov_b32 s31, s9 +; GFX9-O0-NEXT: ; kill: def $sgpr34_sgpr35 killed $sgpr30_sgpr31 +; GFX9-O0-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, s30 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, s31 ; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35 +; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 6 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 7 @@ -638,14 +632,14 @@ ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 ; GFX9-O3-NEXT: s_mov_b64 s[36:37], s[30:31] -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-O3-NEXT: s_getpc_b64 s[34:35] ; GFX9-O3-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12 ; GFX9-O3-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-O3-NEXT: s_mov_b64 exec, s[30:31] +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 @@ -772,27 +766,34 @@ ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX9-O3-NEXT: buffer_load_dwordx4 v[1:4], v0, s[4:7], 0 offen -; GFX9-O3-NEXT: buffer_load_dwordx2 v[5:6], v0, s[4:7], 0 offen offset:16 +; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen +; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16 ; GFX9-O3-NEXT: s_mov_b32 s34, -1 ; GFX9-O3-NEXT: s_brev_b32 s35, -2 -; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v1, s34 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, s35 ; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v3, s34 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, s35 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) +; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v5, s34 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s35 ; GFX9-O3-NEXT: s_not_b64 exec, exec diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -106,8 +106,7 @@ define amdgpu_kernel void @call(<4 x i32> inreg %tmp14, i32 inreg %arg) { ; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]] ; GFX9-O0-DAG: s_mov_b32 s4, 0{{$}} -; GFX9-O0-DAG: v_mov_b32_e32 v0, [[ARG]] -; GFX9-O0-DAG: v_mov_b32_e32 v2, v0 +; GFX9-O0-DAG: v_mov_b32_e32 v2, [[ARG]] ; GFX9-O3: v_mov_b32_e32 v2, [[ARG]] @@ -142,10 +141,8 @@ ; GFX9: s_load_dwordx2 s{{\[}}[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]{{\]}} ; GFX9-O0: s_mov_b64 s{{\[}}[[ZERO_LO:[0-9]+]]:[[ZERO_HI:[0-9]+]]{{\]}}, 0{{$}} -; GFX9-O0: v_mov_b32_e32 v0, s[[ARG_LO]] -; GFX9-O0: v_mov_b32_e32 v1, s[[ARG_HI]] -; GFX9-O0-DAG: v_mov_b32_e32 v9, v1 -; GFX9-O0-DAG: v_mov_b32_e32 v8, v0 +; GFX9-O0-DAG: v_mov_b32_e32 v9, s[[ARG_HI]] +; GFX9-O0-DAG: v_mov_b32_e32 v8, s[[ARG_LO]] ; GFX9-O3-DAG: v_mov_b32_e32 v7, s[[ARG_HI]] ; GFX9-O3-DAG: v_mov_b32_e32 v6, s[[ARG_LO]] @@ -300,8 +297,7 @@ define amdgpu_kernel void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) { ; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]] ; GFX9-O0-DAG: s_mov_b32 s4, 0{{$}} -; GFX9-O0-DAG: v_mov_b32_e32 v0, [[ARG]] -; GFX9-O0-DAG: v_mov_b32_e32 v2, v0 +; GFX9-O0-DAG: v_mov_b32_e32 v2, [[ARG]] ; GFX9-O3: v_mov_b32_e32 v2, [[ARG]] @@ -336,10 +332,8 @@ ; GFX9: s_load_dwordx2 s{{\[}}[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]{{\]}} ; GFX9-O0: s_mov_b64 s{{\[}}[[ZERO_LO:[0-9]+]]:[[ZERO_HI:[0-9]+]]{{\]}}, 0{{$}} -; GFX9-O0: v_mov_b32_e32 v0, s[[ARG_LO]] -; GFX9-O0: v_mov_b32_e32 v1, s[[ARG_HI]] -; GFX9-O0-DAG: v_mov_b32_e32 v9, v1 -; GFX9-O0-DAG: v_mov_b32_e32 v8, v0 +; GFX9-O0-DAG: v_mov_b32_e32 v9, s[[ARG_HI]] +; GFX9-O0-DAG: v_mov_b32_e32 v8, s[[ARG_LO]] ; GFX9-O3-DAG: v_mov_b32_e32 v7, s[[ARG_HI]] ; GFX9-O3-DAG: v_mov_b32_e32 v6, s[[ARG_LO]]