Index: llvm/include/llvm/CodeGen/TargetRegisterInfo.h =================================================================== --- llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -528,6 +528,13 @@ getMatchingSuperRegClass(const TargetRegisterClass *A, const TargetRegisterClass *B, unsigned Idx) const; + /// Check if the registers defined by the pair (RegisterClass, SubReg) + /// share the same register file. + bool shareSameRegisterFile(const TargetRegisterClass *DefRC, + unsigned DefSubReg, + const TargetRegisterClass *SrcRC, + unsigned SrcSubReg) const; + // For a copy-like instruction that defines a register of class DefRC with // subreg index DefSubReg, reading from another source with class SrcRC and // subregister SrcSubReg return true if this is a preferable copy Index: llvm/lib/CodeGen/DetectDeadLanes.cpp =================================================================== --- llvm/lib/CodeGen/DetectDeadLanes.cpp +++ llvm/lib/CodeGen/DetectDeadLanes.cpp @@ -179,15 +179,7 @@ } } - unsigned PreA, PreB; // Unused. - if (SrcSubIdx && DstSubIdx) - return !TRI.getCommonSuperRegClass(SrcRC, SrcSubIdx, DstRC, DstSubIdx, PreA, - PreB); - if (SrcSubIdx) - return !TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSubIdx); - if (DstSubIdx) - return !TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSubIdx); - return !TRI.getCommonSubClass(SrcRC, DstRC); + return !TRI.shareSameRegisterFile(DstRC, DstSubIdx, SrcRC, SrcSubIdx); } void DetectDeadLanes::addUsedLanesOnOperand(const MachineOperand &MO, Index: llvm/lib/CodeGen/MachineLICM.cpp =================================================================== --- llvm/lib/CodeGen/MachineLICM.cpp +++ llvm/lib/CodeGen/MachineLICM.cpp @@ -1154,9 +1154,30 @@ return false; } +static bool isCrossCopy(const MachineInstr &MI) { + if (!MI.isCopy()) + return false; + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + + Register DstReg = MI.getOperand(0).getReg(); + unsigned DstSubIdx = MI.getOperand(0).getSubReg(); + Register SrcReg = MI.getOperand(1).getReg(); + unsigned SrcSubIdx = MI.getOperand(1).getSubReg(); + + if (!SrcReg.isVirtual() || !DstReg.isVirtual()) + return false; + + const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + return !TRI.shareSameRegisterFile(DstRC, DstSubIdx, SrcRC, SrcSubIdx); +} + /// Return true if the instruction is marked "cheap" or the operand latency /// between its def and a use is one or less. bool MachineLICMBase::IsCheapInstruction(MachineInstr &MI) const { + if (isCrossCopy(MI)) + return false; if (TII->isAsCheapAsAMove(MI) || MI.isCopyLike()) return true; Index: llvm/lib/CodeGen/TargetRegisterInfo.cpp =================================================================== --- llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -359,11 +359,10 @@ /// Check if the registers defined by the pair (RegisterClass, SubReg) /// share the same register file. -static bool shareSameRegisterFile(const TargetRegisterInfo &TRI, - const TargetRegisterClass *DefRC, - unsigned DefSubReg, - const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) { +bool TargetRegisterInfo::shareSameRegisterFile(const TargetRegisterClass *DefRC, + unsigned DefSubReg, + const TargetRegisterClass *SrcRC, + unsigned SrcSubReg) const { // Same register class. if (DefRC == SrcRC) return true; @@ -371,8 +370,8 @@ // Both operands are sub registers. Check if they share a register class. unsigned SrcIdx, DefIdx; if (SrcSubReg && DefSubReg) { - return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg, - SrcIdx, DefIdx) != nullptr; + return getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg, SrcIdx, + DefIdx) != nullptr; } // At most one of the register is a sub register, make it Src to avoid @@ -384,10 +383,10 @@ // One of the register is a sub register, check if we can get a superclass. if (SrcSubReg) - return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr; + return getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr; // Plain copy. - return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr; + return getCommonSubClass(DefRC, SrcRC) != nullptr; } bool TargetRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC, @@ -395,7 +394,7 @@ const TargetRegisterClass *SrcRC, unsigned SrcSubReg) const { // If this source does not incur a cross register bank copy, use it. - return shareSameRegisterFile(*this, DefRC, DefSubReg, SrcRC, SrcSubReg); + return shareSameRegisterFile(DefRC, DefSubReg, SrcRC, SrcSubReg); } // Compute target-independent register allocator hints to help eliminate copies. Index: llvm/test/CodeGen/AMDGPU/idiv-licm.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -265,31 +265,31 @@ ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s2, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_movk_i32 s3, 0x400 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: BB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v2, s2, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v8, v1 -; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, v0 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_and_b32_e32 v1, s2, v5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v1 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[1:2] +; GFX9-NEXT: v_add_u16_e32 v5, 1, v5 +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], s4, v6 +; GFX9-NEXT: v_mul_f32_e32 v1, v8, v3 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v1 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], v4, v7, s[0:1] +; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v8 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, v0 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[0:1], 0, v9, s[0:1] ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: global_store_short v[5:6], v2, off +; GFX9-NEXT: global_store_short v[6:7], v1, off ; GFX9-NEXT: s_cbranch_vccz BB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -316,33 +316,33 @@ ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_movk_i32 s6, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s2, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: BB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v2, s2, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v7, v8, v1 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, v0 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v7, v7, s3 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s6, v4 +; GFX9-NEXT: v_and_b32_e32 v1, s2, v5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v1 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[1:2] +; GFX9-NEXT: v_add_u16_e32 v5, 1, v5 +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], s4, v6 +; GFX9-NEXT: v_mul_f32_e32 v9, v8, v3 +; GFX9-NEXT: v_trunc_f32_e32 v9, v9 +; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], v4, v7, s[0:1] +; GFX9-NEXT: v_mad_f32 v8, -v9, v0, v8 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, v0 +; GFX9-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v8, v8, s3 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s6, v5 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7 -; GFX9-NEXT: global_store_short v[5:6], v2, off +; GFX9-NEXT: v_sub_u32_e32 v1, v1, v8 +; GFX9-NEXT: global_store_short v[6:7], v1, off ; GFX9-NEXT: s_cbranch_vccz BB5_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -368,36 +368,36 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_movk_i32 s3, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: BB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_bfe_i32 v5, v4, 0, 16 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v5 -; GFX9-NEXT: v_xor_b32_e32 v8, s2, v5 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v7, v9, v1 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v8 -; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v7 -; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v9 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, |v0| -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v2, v8, v2 +; GFX9-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v6 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GFX9-NEXT: v_xor_b32_e32 v8, s2, v6 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[1:2] +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v8 +; GFX9-NEXT: v_mul_f32_e32 v8, v9, v3 +; GFX9-NEXT: v_trunc_f32_e32 v8, v8 +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], s4, v6 +; GFX9-NEXT: v_cvt_i32_f32_e32 v10, v8 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], v4, v7, s[0:1] +; GFX9-NEXT: v_mad_f32 v8, -v8, v0, v9 +; GFX9-NEXT: v_add_u16_e32 v5, 1, v5 +; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, |v0| +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, v10, v1 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: global_store_short v[5:6], v2, off +; GFX9-NEXT: global_store_short v[6:7], v1, off ; GFX9-NEXT: s_cbranch_vccz BB6_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -423,38 +423,38 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_movk_i32 s3, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: BB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_bfe_i32 v7, v4, 0, 16 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v7 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v8, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v8, v10, v1 -; GFX9-NEXT: v_xor_b32_e32 v9, s2, v7 -; GFX9-NEXT: v_trunc_f32_e32 v8, v8 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v9 -; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v8 -; GFX9-NEXT: v_mad_f32 v8, -v8, v0, v10 -; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, |v0| -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v2, v9, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 +; GFX9-NEXT: v_bfe_i32 v8, v5, 0, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v8 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GFX9-NEXT: v_xor_b32_e32 v9, s2, v8 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[1:2] +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v9 +; GFX9-NEXT: v_mul_f32_e32 v9, v10, v3 +; GFX9-NEXT: v_trunc_f32_e32 v9, v9 +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], s4, v6 +; GFX9-NEXT: v_cvt_i32_f32_e32 v11, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], v4, v7, s[0:1] +; GFX9-NEXT: v_mad_f32 v9, -v9, v0, v10 +; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v9|, |v0| +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, v11, v1 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 +; GFX9-NEXT: v_add_u16_e32 v5, 1, v5 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v5 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, v7, v2 -; GFX9-NEXT: global_store_short v[5:6], v2, off +; GFX9-NEXT: v_sub_u32_e32 v1, v8, v1 +; GFX9-NEXT: global_store_short v[6:7], v1, off ; GFX9-NEXT: s_cbranch_vccz BB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/sdiv64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -200,40 +200,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB0_6 ; GCN-IR-NEXT: BB0_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[10:11], v4 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], s[10:11], v4 ; GCN-IR-NEXT: s_add_u32 s10, s6, -1 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GCN-IR-NEXT: s_addc_u32 s11, s7, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s11 ; GCN-IR-NEXT: BB0_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s11 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s10, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s6, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s7, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s10, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s6, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s7, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1072,40 +1072,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB9_6 ; GCN-IR-NEXT: BB9_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[10:11], v4 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], s[10:11], v4 ; GCN-IR-NEXT: s_add_u32 s10, s8, -1 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GCN-IR-NEXT: s_addc_u32 s11, s9, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s11 ; GCN-IR-NEXT: BB9_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s11 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s10, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s8, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s9, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s10, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s8, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s9, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB9_5 ; GCN-IR-NEXT: BB9_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1299,38 +1299,39 @@ ; GCN-IR-NEXT: s_branch BB10_6 ; GCN-IR-NEXT: BB10_4: ; %udiv-preheader ; GCN-IR-NEXT: s_add_u32 s7, s8, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], 24, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 58, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], 24, v3 ; GCN-IR-NEXT: s_addc_u32 s10, s9, -1 -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, 58, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s10 ; GCN-IR-NEXT: BB10_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s10 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s7, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s8, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s9, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s7, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s8, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s9, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB10_5 ; GCN-IR-NEXT: BB10_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1835,30 +1836,31 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v8, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, 31 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[9:10], v[9:10], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v4 -; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_lshr_b32_e32 v5, v4, v0 +; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v5 ; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[3:4], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, s12, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, s12, v9 ; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v10, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 +; GCN-IR-NEXT: v_ashr_i32_e32 v11, v5, v0 +; GCN-IR-NEXT: v_and_b32_e32 v13, 0x8000, v11 +; GCN-IR-NEXT: v_and_b32_e32 v5, 1, v11 ; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, 1, v7 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v5 ; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v8, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[11:12], v[7:8] -; GCN-IR-NEXT: v_and_b32_e32 v5, 1, v9 -; GCN-IR-NEXT: v_and_b32_e32 v9, 0x8000, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 ; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 -; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 -; GCN-IR-NEXT: v_sub_i32_e64 v9, s[4:5], v0, v9 -; GCN-IR-NEXT: v_subb_u32_e64 v10, s[4:5], v10, v13, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_sub_i32_e64 v9, s[4:5], v9, v13 +; GCN-IR-NEXT: v_subb_u32_e64 v10, s[4:5], v10, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] Index: llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -540,10 +540,7 @@ ; GCN: s_mov_b32 s{{[0-9]+}}, 0xffff ; GCN: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; -; TODO: Why is the constant not peepholed into the v_or_b32_e32? -; -; NOSDWA: s_mov_b32 [[CONST:s[0-9]+]], 0x10000 -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, s0, +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, 0x10000, ; SDWA: v_or_b32_e32 v{{[0-9]+}}, 0x10000, define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 { bb: Index: llvm/test/CodeGen/AMDGPU/srem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/srem64.ll +++ llvm/test/CodeGen/AMDGPU/srem64.ll @@ -173,40 +173,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB0_6 ; GCN-IR-NEXT: BB0_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 ; GCN-IR-NEXT: s_add_u32 s8, s2, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[6:7], v4 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: s_addc_u32 s9, s3, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], s[6:7], v4 +; GCN-IR-NEXT: s_addc_u32 s9, s3, -1 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s9 ; GCN-IR-NEXT: BB0_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s9 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s8, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s2, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s3, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s8, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s2, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s3, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1080,40 +1080,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB8_6 ; GCN-IR-NEXT: BB8_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 ; GCN-IR-NEXT: s_add_u32 s6, s10, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[8:9], v4 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: s_addc_u32 s7, s11, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], s[8:9], v4 +; GCN-IR-NEXT: s_addc_u32 s7, s11, -1 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s7 ; GCN-IR-NEXT: BB8_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s7 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s6, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s10, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s11, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s6, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s10, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s11, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB8_5 ; GCN-IR-NEXT: BB8_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1251,40 +1251,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB9_6 ; GCN-IR-NEXT: BB9_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 ; GCN-IR-NEXT: s_add_u32 s10, s8, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[6:7], v4 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: s_addc_u32 s11, s9, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], s[6:7], v4 +; GCN-IR-NEXT: s_addc_u32 s11, s9, -1 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s11 ; GCN-IR-NEXT: BB9_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s11 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s10, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s8, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s9, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s10, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s8, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s9, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB9_5 ; GCN-IR-NEXT: BB9_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1480,38 +1480,39 @@ ; GCN-IR-NEXT: s_branch BB10_6 ; GCN-IR-NEXT: BB10_4: ; %udiv-preheader ; GCN-IR-NEXT: s_add_u32 s7, s2, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], 24, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 58, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], 24, v3 ; GCN-IR-NEXT: s_addc_u32 s8, s3, -1 -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, 58, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 ; GCN-IR-NEXT: BB10_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s7, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s2, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s3, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s7, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s2, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s3, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB10_5 ; GCN-IR-NEXT: BB10_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -2012,37 +2013,38 @@ ; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB13_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v9 +; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: v_lshr_b64 v[11:12], v[0:1], v9 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 0xffffffcf, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], 0, -1, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 31 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 +; GCN-IR-NEXT: v_lshl_b64 v[11:12], v[11:12], 1 +; GCN-IR-NEXT: v_lshr_b32_e32 v6, v5, v10 +; GCN-IR-NEXT: v_or_b32_e32 v11, v11, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, s12, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, 0, v11, vcc -; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 -; GCN-IR-NEXT: v_and_b32_e32 v14, 0x8000, v12 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8 -; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, s12, v11 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, 0, v12, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v13, v4 +; GCN-IR-NEXT: v_ashr_i32_e32 v13, v6, v10 +; GCN-IR-NEXT: v_and_b32_e32 v15, 0x8000, v13 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v13 +; GCN-IR-NEXT: v_add_i32_e32 v13, vcc, 1, v8 +; GCN-IR-NEXT: v_or_b32_e32 v5, v14, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v14, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[13:14], v[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v8, v13 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 -; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 -; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v14 -; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v15, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v14 +; GCN-IR-NEXT: v_mov_b32_e32 v14, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 +; GCN-IR-NEXT: v_sub_i32_e64 v11, s[4:5], v11, v15 +; GCN-IR-NEXT: v_subb_u32_e64 v12, s[4:5], v12, v16, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz BB13_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow Index: llvm/test/CodeGen/AMDGPU/udiv64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/udiv64.ll +++ llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -174,40 +174,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB0_6 ; GCN-IR-NEXT: BB0_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[6:7], v4 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], s[6:7], v4 ; GCN-IR-NEXT: s_add_u32 s6, s2, -1 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GCN-IR-NEXT: s_addc_u32 s7, s3, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s7 ; GCN-IR-NEXT: BB0_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s7 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s6, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s2, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s3, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s6, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s2, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s3, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -885,40 +885,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB7_6 ; GCN-IR-NEXT: BB7_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[6:7], v4 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], s[6:7], v4 ; GCN-IR-NEXT: s_add_u32 s6, s2, -1 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GCN-IR-NEXT: s_addc_u32 s7, s3, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s7 ; GCN-IR-NEXT: BB7_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s7 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s6, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s2, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s3, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s6, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s2, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s3, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB7_5 ; GCN-IR-NEXT: BB7_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1089,38 +1089,39 @@ ; GCN-IR-NEXT: s_branch BB8_6 ; GCN-IR-NEXT: BB8_4: ; %udiv-preheader ; GCN-IR-NEXT: s_add_u32 s3, s6, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], 24, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 58, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], 24, v3 ; GCN-IR-NEXT: s_addc_u32 s8, s7, -1 -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, 58, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 ; GCN-IR-NEXT: BB8_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s3, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s6, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s7, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s3, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s6, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s7, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB8_5 ; GCN-IR-NEXT: BB8_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1377,30 +1378,31 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 31 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v6, v7, v4 +; GCN-IR-NEXT: v_lshr_b32_e32 v4, v3, v6 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v4 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v7 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v8, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v9, v2 +; GCN-IR-NEXT: v_ashr_i32_e32 v9, v4, v6 +; GCN-IR-NEXT: v_and_b32_e32 v11, 0x8000, v9 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v9 ; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v0 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v7, 31, v4 ; GCN-IR-NEXT: v_or_b32_e32 v3, v10, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1] -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v7 -; GCN-IR-NEXT: v_and_b32_e32 v7, 0x8000, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v0, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v5 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v6, v7 -; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v11, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v11 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v12, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v9, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] @@ -1581,6 +1583,7 @@ ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 23, v6 ; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 @@ -1595,7 +1598,7 @@ ; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[0:1], 0, v7, s[0:1] +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v3, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB11_5 @@ -1753,25 +1756,26 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 31 ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v6, v7, v4 +; GCN-IR-NEXT: v_lshr_b32_e32 v4, v3, v6 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v4 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 23, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 23, v7 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v8, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v9, v2 +; GCN-IR-NEXT: v_ashr_i32_e32 v9, v4, v6 +; GCN-IR-NEXT: v_and_b32_e32 v11, 24, v9 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v9 ; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v0 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v7, 31, v4 ; GCN-IR-NEXT: v_or_b32_e32 v3, v10, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1] -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v7 -; GCN-IR-NEXT: v_and_b32_e32 v7, 24, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v0, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v6, v7 +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v11 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v5 ; GCN-IR-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5] Index: llvm/test/CodeGen/AMDGPU/urem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/urem64.ll +++ llvm/test/CodeGen/AMDGPU/urem64.ll @@ -173,40 +173,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB0_6 ; GCN-IR-NEXT: BB0_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 ; GCN-IR-NEXT: s_add_u32 s8, s2, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[6:7], v4 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: s_addc_u32 s9, s3, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], s[6:7], v4 +; GCN-IR-NEXT: s_addc_u32 s9, s3, -1 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s9 ; GCN-IR-NEXT: BB0_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s9 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s8, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s2, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s3, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s8, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s2, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s3, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -897,38 +897,39 @@ ; GCN-IR-NEXT: s_branch BB6_6 ; GCN-IR-NEXT: BB6_4: ; %udiv-preheader ; GCN-IR-NEXT: s_add_u32 s3, s6, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], 24, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 58, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], 24, v3 ; GCN-IR-NEXT: s_addc_u32 s8, s7, -1 -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, 58, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 ; GCN-IR-NEXT: BB6_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s3, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s6, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s7, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s3, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s6, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s7, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB6_5 ; GCN-IR-NEXT: BB6_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1111,6 +1112,7 @@ ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 23, v6 ; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 @@ -1125,7 +1127,7 @@ ; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[0:1], 0, v7, s[0:1] +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v3, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB7_5 @@ -1394,37 +1396,38 @@ ; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v7 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_lshr_b64 v[9:10], v[0:1], v7 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v6 ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 31 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 +; GCN-IR-NEXT: v_lshl_b64 v[9:10], v[9:10], 1 +; GCN-IR-NEXT: v_lshr_b32_e32 v4, v3, v8 +; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v4 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_and_b32_e32 v12, 0x8000, v10 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v9 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v10, vcc +; GCN-IR-NEXT: v_or_b32_e32 v2, v11, v2 +; GCN-IR-NEXT: v_ashr_i32_e32 v11, v4, v8 +; GCN-IR-NEXT: v_and_b32_e32 v13, 0x8000, v11 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v11 +; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, 1, v6 +; GCN-IR-NEXT: v_or_b32_e32 v3, v12, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[11:12], v[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v6, v11 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 -; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v12 -; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v13, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v12 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_sub_i32_e64 v9, s[4:5], v9, v13 +; GCN-IR-NEXT: v_subb_u32_e64 v10, s[4:5], v10, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz BB9_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow