Index: llvm/lib/CodeGen/MachineLoopInfo.cpp =================================================================== --- llvm/lib/CodeGen/MachineLoopInfo.cpp +++ llvm/lib/CodeGen/MachineLoopInfo.cpp @@ -18,6 +18,7 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Config/llvm-config.h" #include "llvm/InitializePasses.h" @@ -154,7 +155,9 @@ bool MachineLoop::isLoopInvariant(MachineInstr &I) const { MachineFunction *MF = I.getParent()->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + const TargetSubtargetInfo &ST = MF->getSubtarget(); + const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetInstrInfo *TII = ST.getInstrInfo(); // The instruction is loop invariant if all of its operands are. for (const MachineOperand &MO : I.operands()) { @@ -174,7 +177,8 @@ // However, if the physreg is known to always be caller saved/restored // then this use is safe to hoist. if (!MRI->isConstantPhysReg(Reg) && - !(TRI->isCallerPreservedPhysReg(Reg.asMCReg(), *I.getMF()))) + !(TRI->isCallerPreservedPhysReg(Reg.asMCReg(), *I.getMF())) && + !TII->isIgnorableUse(MO)) return false; // Otherwise it's safe to move. continue; Index: llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -120,25 +120,25 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: BB5_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v1, v0 -; CHECK-NEXT: v_not_b32_e32 v0, v1 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_or_b32_e32 v0, -2, v0 -; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v3, v1 +; CHECK-NEXT: v_not_b32_e32 v1, v3 +; CHECK-NEXT: v_or_b32_e32 v2, -2, v1 +; CHECK-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execnz BB5_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3] +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, 12, s[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -330,24 +330,24 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: BB14_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v1, v0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_add_f32_e32 v0, 1.0, v1 -; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v3, v1 +; CHECK-NEXT: v_add_f32_e32 v2, 1.0, v3 +; CHECK-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execnz BB14_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v1 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3] ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -365,24 +365,24 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: BB15_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v1, v0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_add_f32_e32 v0, -1.0, v1 -; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v3, v1 +; CHECK-NEXT: v_add_f32_e32 v2, -1.0, v3 +; CHECK-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execnz BB15_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v1 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3] ; CHECK-NEXT: global_store_dword v[0:1], v2, off Index: llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -416,13 +416,13 @@ ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: BB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -464,13 +464,13 @@ ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: BB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -626,13 +626,13 @@ ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: BB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc Index: llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -9,108 +9,108 @@ ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v1, s4 ; GFX900-NEXT: BB0_1: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX900-NEXT: s_cbranch_execnz BB0_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX900-NEXT: global_store_dword v[0:1], v0, off +; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_ret_f32: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_mov_b64 s[2:3], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, s4 +; GFX908-NEXT: v_mov_b32_e32 v1, s4 ; GFX908-NEXT: BB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v2, 0 -; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX908-NEXT: s_cbranch_execnz BB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX908-NEXT: global_store_dword v[0:1], v0, off +; GFX908-NEXT: global_store_dword v[0:1], v1, off ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NEXT: BB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v3 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX90A-NEXT: s_cbranch_execnz BB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: global_store_dword v[0:1], v1, off ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_fadd_ret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: BB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_execnz BB0_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef @@ -122,52 +122,52 @@ ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v1, s4 ; GFX900-NEXT: BB1_1: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX900-NEXT: s_cbranch_execnz BB1_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX900-NEXT: global_store_dword v[0:1], v0, off +; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_ret_f32_ieee: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_mov_b64 s[2:3], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, s4 +; GFX908-NEXT: v_mov_b32_e32 v1, s4 ; GFX908-NEXT: BB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v2, 0 -; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX908-NEXT: s_cbranch_execnz BB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX908-NEXT: global_store_dword v[0:1], v0, off +; GFX908-NEXT: global_store_dword v[0:1], v1, off ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee: @@ -185,29 +185,29 @@ ; GFX10-LABEL: global_atomic_fadd_ret_f32_ieee: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: BB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_execnz BB1_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef @@ -219,13 +219,13 @@ ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, s4 ; GFX900-NEXT: BB2_1: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc @@ -264,6 +264,7 @@ ; GFX10-LABEL: global_atomic_fadd_noret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -271,7 +272,6 @@ ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: BB2_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -295,13 +295,13 @@ ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, s4 ; GFX900-NEXT: BB3_1: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc @@ -340,6 +340,7 @@ ; GFX10-LABEL: global_atomic_fadd_noret_f32_ieee: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -347,7 +348,6 @@ ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: BB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -371,52 +371,52 @@ ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v1, s4 ; GFX900-NEXT: BB4_1: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX900-NEXT: s_cbranch_execnz BB4_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX900-NEXT: global_store_dword v[0:1], v0, off +; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_ret_f32_agent: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_mov_b64 s[2:3], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, s4 +; GFX908-NEXT: v_mov_b32_e32 v1, s4 ; GFX908-NEXT: BB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v2, 0 -; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX908-NEXT: s_cbranch_execnz BB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX908-NEXT: global_store_dword v[0:1], v0, off +; GFX908-NEXT: global_store_dword v[0:1], v1, off ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_ret_f32_agent: @@ -434,29 +434,29 @@ ; GFX10-LABEL: global_atomic_fadd_ret_f32_agent: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: BB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_execnz BB4_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef @@ -468,108 +468,108 @@ ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v1, s4 ; GFX900-NEXT: BB5_1: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX900-NEXT: s_cbranch_execnz BB5_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX900-NEXT: global_store_dword v[0:1], v0, off +; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_ret_f32_system: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_mov_b64 s[2:3], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, s4 +; GFX908-NEXT: v_mov_b32_e32 v1, s4 ; GFX908-NEXT: BB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v2, 0 -; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX908-NEXT: s_cbranch_execnz BB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX908-NEXT: global_store_dword v[0:1], v0, off +; GFX908-NEXT: global_store_dword v[0:1], v1, off ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_ret_f32_system: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NEXT: BB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v3 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX90A-NEXT: s_cbranch_execnz BB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: global_store_dword v[0:1], v1, off ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_fadd_ret_f32_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: BB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_execnz BB5_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("one-as") seq_cst store float %result, float addrspace(1)* undef @@ -581,26 +581,26 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b64 s[2:3], 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: BB6_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_mov_b32_e32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_add_f32_e32 v1, 4.0, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GCN-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN-NEXT: s_cbranch_execnz BB6_1 ; GCN-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: global_store_dword v[0:1], v1, off ; GCN-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef @@ -627,13 +627,13 @@ ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, s4 ; GFX900-NEXT: BB8_1: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc @@ -651,13 +651,13 @@ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_mov_b64 s[2:3], 0 +; GFX908-NEXT: v_mov_b32_e32 v2, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s4 ; GFX908-NEXT: BB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v2, 0 ; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc @@ -675,13 +675,13 @@ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NEXT: BB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc @@ -698,6 +698,7 @@ ; GFX10-LABEL: global_atomic_fadd_noret_f32_safe: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -705,7 +706,6 @@ ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: BB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -729,6 +729,7 @@ ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -738,7 +739,6 @@ ; GFX900-NEXT: BB9_1: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v1 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -774,6 +774,7 @@ ; GFX10-LABEL: infer_as_before_atomic: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -784,7 +785,6 @@ ; GFX10-NEXT: BB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 Index: llvm/test/CodeGen/AMDGPU/indirect-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -314,12 +314,12 @@ ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] ; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] ; GCN-NEXT: s_mov_b64 s[46:47], exec +; GCN-NEXT: v_mov_b32_e32 v2, 0x7b ; GCN-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GCN-NEXT: v_mov_b32_e32 v0, 0x7b ; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] @@ -327,9 +327,11 @@ ; GCN-NEXT: s_mov_b32 s12, s44 ; GCN-NEXT: s_mov_b32 s13, s43 ; GCN-NEXT: s_mov_b32 s14, s42 +; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr2 ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB3_1 ; GCN-NEXT: ; %bb.2: Index: llvm/test/CodeGen/AMDGPU/infinite-loop.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -147,6 +147,7 @@ ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: BB3_2: ; %outer_loop ; SI-NEXT: ; =>This Loop Header: Depth=1 ; SI-NEXT: ; Child Loop BB3_3 Depth 2 @@ -156,8 +157,6 @@ ; SI-NEXT: ; => This Inner Loop Header: Depth=2 ; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1] ; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -61,43 +61,44 @@ ; GFX9-NEXT: s_cbranch_execz BB1_3 ; GFX9-NEXT: ; %bb.1: ; %bb19 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v0 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v6 -; GFX9-NEXT: v_add_u32_e32 v6, v4, v0 -; GFX9-NEXT: v_lshl_add_u32 v3, v6, 2, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 2, v2 -; GFX9-NEXT: v_add_u32_e32 v7, v17, v12 +; GFX9-NEXT: v_lshl_add_u32 v6, v4, 2, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v2 +; GFX9-NEXT: v_add_u32_e32 v9, v17, v12 ; GFX9-NEXT: s_mov_b64 s[10:11], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x3727c5ac +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: BB1_2: ; %bb23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 -; GFX9-NEXT: v_add_u32_e32 v9, v17, v0 -; GFX9-NEXT: v_add_u32_e32 v12, v7, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 +; GFX9-NEXT: v_add_u32_e32 v12, v17, v0 +; GFX9-NEXT: v_add_u32_e32 v19, v9, v0 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_madak_f32 v8, v8, v4, 0x3727c5ac -; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX9-NEXT: v_mul_u32_u24_e32 v18, v8, v5 -; GFX9-NEXT: v_add_u32_e32 v8, v8, v16 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v8, v13 -; GFX9-NEXT: v_mul_lo_u32 v8, v8, v15 -; GFX9-NEXT: v_sub_u32_e32 v19, v9, v18 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v19, v14 -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-NEXT: v_madak_f32 v3, v3, v7, 0x3727c5ac +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_u32_u24_e32 v18, v3, v5 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v16 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, v13 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, v15 ; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18 -; GFX9-NEXT: v_add_u32_e32 v8, v12, v8 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v12, v14 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-NEXT: v_sub_u32_e32 v18, v19, v18 +; GFX9-NEXT: v_add_u32_e32 v3, v18, v3 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[8:9], 2, v[8:9] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[6:7], v10, v8 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v11, v9, s[6:7] -; GFX9-NEXT: global_load_dword v8, v[8:9], off +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[18:19], 2, v[3:4] +; GFX9-NEXT: v_add_co_u32_e64 v18, s[6:7], v10, v18 +; GFX9-NEXT: v_addc_co_u32_e64 v19, s[6:7], v11, v19, s[6:7] +; GFX9-NEXT: global_load_dword v3, v[18:19], off ; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1 ; GFX9-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] -; GFX9-NEXT: ds_write_b32 v3, v8 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] +; GFX9-NEXT: ds_write_b32 v6, v3 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX9-NEXT: s_cbranch_execnz BB1_2 ; GFX9-NEXT: BB1_3: ; %Flow3 Index: llvm/test/CodeGen/AMDGPU/sdiv64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -383,23 +383,23 @@ ; GCN-IR-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v0, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v11, vcc, v0, v4 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v1 -; GCN-IR-NEXT: v_subb_u32_e32 v10, vcc, v1, v4, vcc +; GCN-IR-NEXT: v_subb_u32_e32 v12, vcc, v1, v4, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v2 ; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v1, v5 ; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v3 ; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v0, v5, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[9:10] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[11:12] ; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v2 ; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v3 ; GCN-IR-NEXT: v_min_u32_e32 v0, v0, v7 -; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v9 +; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v11 ; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 32, v7 -; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v10 +; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v12 ; GCN-IR-NEXT: v_min_u32_e32 v13, v7, v8 ; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v0, v13 ; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], 0, 0, vcc @@ -410,59 +410,59 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v5 -; GCN-IR-NEXT: v_cndmask_b32_e64 v12, v10, 0, s[6:7] +; GCN-IR-NEXT: v_cndmask_b32_e64 v10, v12, 0, s[6:7] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v14, v17 -; GCN-IR-NEXT: v_cndmask_b32_e64 v11, v9, 0, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v16, v17 +; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v15, vcc, 1, v7 -; GCN-IR-NEXT: v_addc_u32_e32 v16, vcc, 0, v8, vcc -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[15:16], v[7:8] +; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v7 +; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v8, vcc +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[14:15], v[7:8] ; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], 63, v7 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[9:10], v7 -; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[11:12], v7 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v19, vcc, -1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v20, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v18, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v0, v0 -; GCN-IR-NEXT: v_lshr_b64 v[15:16], v[9:10], v15 -; GCN-IR-NEXT: v_not_b32_e32 v10, v17 -; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, v0, v13 +; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[11:12], v14 +; GCN-IR-NEXT: v_not_b32_e32 v9, v17 +; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v0, v13 +; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, v9, v16, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v18, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[13:14], v[15:16], 1 +; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v8 -; GCN-IR-NEXT: v_or_b32_e32 v0, v13, v0 +; GCN-IR-NEXT: v_or_b32_e32 v0, v14, v0 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v11, vcc, v19, v0 -; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v20, v14, vcc -; GCN-IR-NEXT: v_or_b32_e32 v7, v17, v7 -; GCN-IR-NEXT: v_add_i32_e32 v17, vcc, 1, v9 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v11 -; GCN-IR-NEXT: v_or_b32_e32 v8, v18, v8 -; GCN-IR-NEXT: v_addc_u32_e32 v18, vcc, 0, v10, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[17:18], v[9:10] -; GCN-IR-NEXT: v_mov_b32_e32 v9, v17 -; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_and_b32_e32 v11, 1, v13 -; GCN-IR-NEXT: v_and_b32_e32 v16, v13, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v18, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v19, v15, vcc +; GCN-IR-NEXT: v_or_b32_e32 v7, v16, v7 +; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v11 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; GCN-IR-NEXT: v_or_b32_e32 v8, v17, v8 +; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v12, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[11:12] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v16 +; GCN-IR-NEXT: v_and_b32_e32 v9, 1, v13 +; GCN-IR-NEXT: v_and_b32_e32 v20, v13, v3 ; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v15, s[4:5], v0, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v10, v18 -; GCN-IR-NEXT: v_mov_b32_e32 v18, v12 -; GCN-IR-NEXT: v_subb_u32_e64 v16, s[4:5], v14, v16, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v0, v13 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v17 +; GCN-IR-NEXT: v_mov_b32_e32 v17, v10 +; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v20, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v17, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v16, v9 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -470,14 +470,14 @@ ; GCN-IR-NEXT: BB1_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[7:8], 1 -; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v3 -; GCN-IR-NEXT: v_or_b32_e32 v11, v11, v2 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v3 +; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v2 ; GCN-IR-NEXT: BB1_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v4 -; GCN-IR-NEXT: v_xor_b32_e32 v3, v11, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v3, v9, v0 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v6 -; GCN-IR-NEXT: v_xor_b32_e32 v2, v12, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v2, v10, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v3, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1476,8 +1476,9 @@ ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 58, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: BB11_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 @@ -1496,7 +1497,6 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v17 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 ; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 @@ -1647,21 +1647,21 @@ ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 -; GCN-IR-NEXT: v_min_u32_e32 v6, v4, v5 +; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, s8 +; GCN-IR-NEXT: v_mov_b32_e32 v6, s8 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v9 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB12_6 @@ -1670,9 +1670,9 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[10:11], v[4:5] ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v4 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] @@ -1681,36 +1681,36 @@ ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v10 +; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v8 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v14, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v15, v11, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc ; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v8 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 ; GCN-IR-NEXT: v_and_b32_e32 v17, v12, v0 -; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 ; GCN-IR-NEXT: v_and_b32_e32 v16, v12, v1 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v6 +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8 ; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v7, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v6, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v17 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v16, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v8 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -1718,12 +1718,12 @@ ; GCN-IR-NEXT: BB12_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v1 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v0 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0 ; GCN-IR-NEXT: BB12_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_xor_b32_e32 v0, v8, v2 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v9, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v6, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1785,6 +1785,7 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v8, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1803,7 +1804,6 @@ ; GCN-IR-NEXT: v_and_b32_e32 v5, 1, v9 ; GCN-IR-NEXT: v_and_b32_e32 v9, 0x8000, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 ; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 Index: llvm/test/CodeGen/AMDGPU/srem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/srem64.ll +++ llvm/test/CodeGen/AMDGPU/srem64.ll @@ -411,8 +411,9 @@ ; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v3, v12 ; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 ; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[0:1], v14 -; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 ; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, v9, v13, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 @@ -428,7 +429,6 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v12, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[11:12] ; GCN-IR-NEXT: v_mov_b32_e32 v11, v16 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_and_b32_e32 v9, 1, v13 ; GCN-IR-NEXT: v_and_b32_e32 v20, v13, v6 ; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v5 @@ -1647,8 +1647,9 @@ ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: BB11_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 @@ -1667,7 +1668,6 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 ; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 @@ -1817,20 +1817,20 @@ ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v4, v2, v3 +; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v4 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v6 ; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_mov_b32_e32 v6, s8 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v5, v7 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB12_6 @@ -1839,9 +1839,9 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[2:3] ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] @@ -1850,36 +1850,36 @@ ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 ; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v14, v10, v1 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -1887,14 +1887,14 @@ ; GCN-IR-NEXT: BB12_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v3 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 +; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 +; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 ; GCN-IR-NEXT: BB12_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v7 -; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v6 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v6 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 +; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 @@ -1960,6 +1960,7 @@ ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 0xffffffcf, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1978,7 +1979,6 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 ; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 Index: llvm/test/CodeGen/AMDGPU/udiv64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/udiv64.ll +++ llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -377,8 +377,9 @@ ; GCN-IR-NEXT: v_not_b32_e32 v1, v9 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, v1, v11, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[12:13], 1 @@ -397,7 +398,6 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1] ; GCN-IR-NEXT: v_mov_b32_e32 v0, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v10, v12 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v7 @@ -1215,30 +1215,30 @@ ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v4, v2, v3 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffd0, v4 -; GCN-IR-NEXT: v_addc_u32_e64 v7, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffd0, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v7 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB9_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v6 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v7, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v6 -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] @@ -1247,36 +1247,36 @@ ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: BB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 ; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v14, v10, v1 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB9_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -1284,8 +1284,8 @@ ; GCN-IR-NEXT: BB9_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v1 -; GCN-IR-NEXT: v_or_b32_e32 v2, v6, v0 +; GCN-IR-NEXT: v_or_b32_e32 v3, v5, v1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v0 ; GCN-IR-NEXT: BB9_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v0, v2 @@ -1340,6 +1340,7 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1358,7 +1359,6 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1] ; GCN-IR-NEXT: v_mov_b32_e32 v0, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 @@ -1708,8 +1708,9 @@ ; GCN-IR-NEXT: v_lshr_b64 v[7:8], v[0:1], v7 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc4, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 @@ -1727,7 +1728,6 @@ ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v7 ; GCN-IR-NEXT: v_and_b32_e32 v7, 24, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v0, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v6, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v5 Index: llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -186,14 +186,15 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v40, s33, 4 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v40, s34, 0 ; GCN-NEXT: v_writelane_b32 v40, s35, 1 ; GCN-NEXT: v_writelane_b32 v40, s36, 2 @@ -202,8 +203,9 @@ ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: flat_load_dword v43, v[1:2] ; GCN-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NEXT: flat_load_dword v43, v[1:2] +; GCN-NEXT: v_mov_b32_e32 v44, 0x7fc00000 ; GCN-NEXT: s_getpc_b64 s[36:37] ; GCN-NEXT: s_add_u32 s36, s36, spam@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s37, s37, spam@rel32@hi+12 @@ -214,13 +216,11 @@ ; GCN-NEXT: BB1_1: ; %bb10 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: BB1_2: ; %bb18 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_mov_b64 s[4:5], 0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: BB1_3: ; %bb2 ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB1_4 Depth 2 @@ -229,8 +229,7 @@ ; GCN-NEXT: ; Parent Loop BB1_3 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: flat_load_dword v0, v[41:42] -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 3, v0 ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc @@ -268,8 +267,7 @@ ; GCN-NEXT: ; %bb.9: ; %bb16 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: BB1_10: ; %bb17 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 Index: llvm/test/CodeGen/AMDGPU/urem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/urem64.ll +++ llvm/test/CodeGen/AMDGPU/urem64.ll @@ -379,40 +379,40 @@ ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v6, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 -; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 ; GCN-IR-NEXT: v_not_b32_e32 v7, v9 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, v7, v11, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[12:13], 1 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v16, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v17, v11, vcc -; GCN-IR-NEXT: v_or_b32_e32 v4, v14, v4 -; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v8 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 -; GCN-IR-NEXT: v_or_b32_e32 v5, v15, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v9, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v8, v14 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 -; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 -; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v10, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v15, v7 -; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v11, v13, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v13, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v17, v10, v2 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v16, v10, v3 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v8 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v8, v10 +; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v17 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v14, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -1239,19 +1239,19 @@ ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v4, v2, v3 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v4 +; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v6 ; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_mov_b32_e32 v6, s8 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v5, v7 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB8_6 @@ -1260,9 +1260,9 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[2:3] ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] @@ -1271,36 +1271,36 @@ ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: BB8_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 ; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v14, v10, v1 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB8_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -1308,14 +1308,14 @@ ; GCN-IR-NEXT: BB8_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v3 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 +; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 +; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 ; GCN-IR-NEXT: BB8_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v7 -; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v6 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v6 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 +; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 @@ -1370,6 +1370,7 @@ ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v6 ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1388,7 +1389,6 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 Index: llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -6,15 +6,15 @@ ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 +; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: BB0_1: ; %bb0 ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_2 Depth 2 -; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 -; GCN-NEXT: s_mov_b32 s5, exec_lo -; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] ; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: BB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) Index: llvm/test/CodeGen/AMDGPU/wave32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/wave32.ll +++ llvm/test/CodeGen/AMDGPU/wave32.ll @@ -250,9 +250,9 @@ ; GCN: s_cbranch_execz ; GCN: BB{{.*}}: -; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], exec_lo -; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], exec -; GCN: global_load_dword [[LOAD:v[0-9]+]] +; GFX1032-DAG: s_or_b32 [[MASK1]], [[MASK1]], exec_lo +; GFX1064-DAG: s_or_b64 [[MASK1]], [[MASK1]], exec +; GCN-DAG: global_load_dword [[LOAD:v[0-9]+]] ; GFX1032: v_cmp_gt_i32_e32 vcc_lo, 11, [[LOAD]] ; GFX1064: v_cmp_gt_i32_e32 vcc, 11, [[LOAD]] define amdgpu_kernel void @test_loop_with_if_else_break(i32 addrspace(1)* %arg) #0 {