Index: llvm/lib/CodeGen/MachineLICM.cpp
===================================================================
--- llvm/lib/CodeGen/MachineLICM.cpp
+++ llvm/lib/CodeGen/MachineLICM.cpp
@@ -1154,9 +1154,36 @@
   return false;
 }
 
+/// Return true if \p MI is a COPY between two virtual registers for which the
+/// target's shouldRewriteCopySrc hook answers false. Such copies are not
+/// treated as cheap by IsCheapInstruction.
+static bool isCrossCopy(const MachineInstr &MI) {
+  if (!MI.isCopy())
+    return false;
+
+  const auto &DefOp = MI.getOperand(0);
+  const auto &UseOp = MI.getOperand(1);
+  const Register DefReg = DefOp.getReg();
+  const Register UseReg = UseOp.getReg();
+  // Only copies whose source and destination are both virtual qualify.
+  if (!(UseReg.isVirtual() && DefReg.isVirtual()))
+    return false;
+
+  const MachineRegisterInfo &RegInfo = MI.getMF()->getRegInfo();
+  // Defer to shouldRewriteCopySrc, which internally calls
+  // shareSameRegisterFile (but can be overridden by the target).
+  const TargetRegisterInfo *RI = RegInfo.getTargetRegisterInfo();
+  const bool Rewritable = RI->shouldRewriteCopySrc(
+      RegInfo.getRegClass(DefReg), DefOp.getSubReg(),
+      RegInfo.getRegClass(UseReg), UseOp.getSubReg());
+  return !Rewritable;
+}
+
 /// Return true if the instruction is marked "cheap" or the operand latency
 /// between its def and a use is one or less.
bool MachineLICMBase::IsCheapInstruction(MachineInstr &MI) const { + if (isCrossCopy(MI)) + return false; if (TII->isAsCheapAsAMove(MI) || MI.isCopyLike()) return true; Index: llvm/test/CodeGen/AMDGPU/idiv-licm.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -265,31 +265,31 @@ ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s2, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NEXT: s_movk_i32 s3, 0x400 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: BB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v2, s2, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v8, v1 -; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, v0 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_and_b32_e32 v1, s2, v5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v1 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[1:2] +; GFX9-NEXT: v_add_u16_e32 v5, 1, v5 +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], s4, v6 +; GFX9-NEXT: v_mul_f32_e32 v1, v8, v3 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v1 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], v4, v7, s[0:1] +; GFX9-NEXT: 
v_mad_f32 v1, -v1, v0, v8 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, v0 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[0:1], 0, v9, s[0:1] ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: global_store_short v[5:6], v2, off +; GFX9-NEXT: global_store_short v[6:7], v1, off ; GFX9-NEXT: s_cbranch_vccz BB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -316,33 +316,33 @@ ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_movk_i32 s6, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s2, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: s_movk_i32 s5, 0x400 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: BB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v2, s2, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v7, v8, v1 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, v0 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v7, v7, s3 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s6, v4 +; GFX9-NEXT: v_and_b32_e32 v1, s2, v5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v1 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[1:2] +; GFX9-NEXT: v_add_u16_e32 v5, 1, v5 +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], s4, v6 +; GFX9-NEXT: v_mul_f32_e32 v9, v8, v3 +; GFX9-NEXT: v_trunc_f32_e32 v9, v9 +; GFX9-NEXT: 
v_cvt_u32_f32_e32 v10, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], v4, v7, s[0:1] +; GFX9-NEXT: v_mad_f32 v8, -v9, v0, v8 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, v0 +; GFX9-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v8, v8, s3 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7 -; GFX9-NEXT: global_store_short v[5:6], v2, off +; GFX9-NEXT: v_sub_u32_e32 v1, v1, v8 +; GFX9-NEXT: global_store_short v[6:7], v1, off ; GFX9-NEXT: s_cbranch_vccz BB5_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -368,36 +368,36 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_movk_i32 s3, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: BB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_bfe_i32 v5, v4, 0, 16 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v5 -; GFX9-NEXT: v_xor_b32_e32 v8, s2, v5 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v7, v9, v1 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v8 -; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v7 -; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v9 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, |v0| -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] -; GFX9-NEXT: 
v_add_u32_e32 v2, v8, v2 +; GFX9-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v6 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GFX9-NEXT: v_xor_b32_e32 v8, s2, v6 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[1:2] +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v8 +; GFX9-NEXT: v_mul_f32_e32 v8, v9, v3 +; GFX9-NEXT: v_trunc_f32_e32 v8, v8 +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], s4, v6 +; GFX9-NEXT: v_cvt_i32_f32_e32 v10, v8 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], v4, v7, s[0:1] +; GFX9-NEXT: v_mad_f32 v8, -v8, v0, v9 +; GFX9-NEXT: v_add_u16_e32 v5, 1, v5 +; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, |v0| +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, v10, v1 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: global_store_short v[5:6], v2, off +; GFX9-NEXT: global_store_short v[6:7], v1, off ; GFX9-NEXT: s_cbranch_vccz BB6_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -423,38 +423,38 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_movk_i32 s3, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: BB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_bfe_i32 v7, v4, 0, 16 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v7 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v8, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v8, v10, v1 -; GFX9-NEXT: 
v_xor_b32_e32 v9, s2, v7 -; GFX9-NEXT: v_trunc_f32_e32 v8, v8 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v9 -; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v8 -; GFX9-NEXT: v_mad_f32 v8, -v8, v0, v10 -; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, |v0| -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v2, v9, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 +; GFX9-NEXT: v_bfe_i32 v8, v5, 0, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v8 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GFX9-NEXT: v_xor_b32_e32 v9, s2, v8 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[1:2] +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v9 +; GFX9-NEXT: v_mul_f32_e32 v9, v10, v3 +; GFX9-NEXT: v_trunc_f32_e32 v9, v9 +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], s4, v6 +; GFX9-NEXT: v_cvt_i32_f32_e32 v11, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], v4, v7, s[0:1] +; GFX9-NEXT: v_mad_f32 v9, -v9, v0, v10 +; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v9|, |v0| +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, v11, v1 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 +; GFX9-NEXT: v_add_u16_e32 v5, 1, v5 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v5 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, v7, v2 -; GFX9-NEXT: global_store_short v[5:6], v2, off +; GFX9-NEXT: v_sub_u32_e32 v1, v8, v1 +; GFX9-NEXT: global_store_short v[6:7], v1, off ; GFX9-NEXT: s_cbranch_vccz BB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/sdiv64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -200,40 +200,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB0_6 ; GCN-IR-NEXT: BB0_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[10:11], v4 +; 
GCN-IR-NEXT: v_lshr_b64 v[7:8], s[10:11], v4 ; GCN-IR-NEXT: s_add_u32 s10, s6, -1 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GCN-IR-NEXT: s_addc_u32 s11, s7, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s11 ; GCN-IR-NEXT: BB0_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s11 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s10, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s6, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s7, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s10, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s6, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s7, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 
v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1072,40 +1072,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB9_6 ; GCN-IR-NEXT: BB9_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[10:11], v4 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], s[10:11], v4 ; GCN-IR-NEXT: s_add_u32 s10, s8, -1 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GCN-IR-NEXT: s_addc_u32 s11, s9, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s11 ; GCN-IR-NEXT: BB9_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s11 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s10, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s8, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: 
v_and_b32_e32 v11, s9, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s10, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s8, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s9, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB9_5 ; GCN-IR-NEXT: BB9_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1299,38 +1299,39 @@ ; GCN-IR-NEXT: s_branch BB10_6 ; GCN-IR-NEXT: BB10_4: ; %udiv-preheader ; GCN-IR-NEXT: s_add_u32 s7, s8, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], 24, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 58, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], 24, v3 ; GCN-IR-NEXT: s_addc_u32 s10, s9, -1 -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, 58, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s10 ; 
GCN-IR-NEXT: BB10_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s10 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s7, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s8, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s9, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s7, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s8, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s9, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB10_5 ; GCN-IR-NEXT: BB10_6: ; 
%udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1835,30 +1836,31 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v8, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, 31 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[9:10], v[9:10], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v4 -; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_lshr_b32_e32 v5, v4, v0 +; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v5 ; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[3:4], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, s12, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, s12, v9 ; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v10, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 +; GCN-IR-NEXT: v_ashr_i32_e32 v11, v5, v0 +; GCN-IR-NEXT: v_and_b32_e32 v13, 0x8000, v11 +; GCN-IR-NEXT: v_and_b32_e32 v5, 1, v11 ; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, 1, v7 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v5 ; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v8, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[11:12], v[7:8] -; GCN-IR-NEXT: v_and_b32_e32 v5, 1, v9 -; GCN-IR-NEXT: v_and_b32_e32 v9, 0x8000, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 ; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 -; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 -; GCN-IR-NEXT: v_sub_i32_e64 v9, s[4:5], v0, v9 -; GCN-IR-NEXT: v_subb_u32_e64 v10, s[4:5], v10, v13, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_sub_i32_e64 v9, s[4:5], v9, v13 +; GCN-IR-NEXT: v_subb_u32_e64 v10, s[4:5], v10, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] Index: llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll 
+++ llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -540,10 +540,7 @@ ; GCN: s_mov_b32 s{{[0-9]+}}, 0xffff ; GCN: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; -; TODO: Why is the constant not peepholed into the v_or_b32_e32? -; -; NOSDWA: s_mov_b32 [[CONST:s[0-9]+]], 0x10000 -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, s0, +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, 0x10000, ; SDWA: v_or_b32_e32 v{{[0-9]+}}, 0x10000, define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 { bb: Index: llvm/test/CodeGen/AMDGPU/srem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/srem64.ll +++ llvm/test/CodeGen/AMDGPU/srem64.ll @@ -173,40 +173,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB0_6 ; GCN-IR-NEXT: BB0_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 ; GCN-IR-NEXT: s_add_u32 s8, s2, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[6:7], v4 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: s_addc_u32 s9, s3, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], s[6:7], v4 +; GCN-IR-NEXT: s_addc_u32 s9, s3, -1 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s9 ; GCN-IR-NEXT: BB0_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s9 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s8, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: 
v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s2, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s3, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s8, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s2, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s3, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1080,40 +1080,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB8_6 ; GCN-IR-NEXT: BB8_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 ; GCN-IR-NEXT: s_add_u32 s6, s10, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[8:9], v4 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: s_addc_u32 s7, s11, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; 
GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], s[8:9], v4 +; GCN-IR-NEXT: s_addc_u32 s7, s11, -1 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s7 ; GCN-IR-NEXT: BB8_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s7 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s6, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s10, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s11, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s6, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s10, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s11, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: 
v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB8_5 ; GCN-IR-NEXT: BB8_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1251,40 +1251,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB9_6 ; GCN-IR-NEXT: BB9_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 ; GCN-IR-NEXT: s_add_u32 s10, s8, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[6:7], v4 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: s_addc_u32 s11, s9, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], s[6:7], v4 +; GCN-IR-NEXT: s_addc_u32 s11, s9, -1 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s11 ; GCN-IR-NEXT: BB9_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s11 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s10, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s8, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s9, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; 
GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s10, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s8, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s9, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB9_5 ; GCN-IR-NEXT: BB9_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1480,38 +1480,39 @@ ; GCN-IR-NEXT: s_branch BB10_6 ; GCN-IR-NEXT: BB10_4: ; %udiv-preheader ; GCN-IR-NEXT: s_add_u32 s7, s2, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], 24, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 58, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], 24, v3 ; GCN-IR-NEXT: s_addc_u32 s8, s3, -1 -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, 58, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 ; GCN-IR-NEXT: BB10_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; 
GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s7, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s2, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s3, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s7, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s2, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s3, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB10_5 ; GCN-IR-NEXT: BB10_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -2012,37 +2013,38 @@ ; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB13_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; 
GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v9 +; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: v_lshr_b64 v[11:12], v[0:1], v9 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 0xffffffcf, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], 0, -1, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 31 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 +; GCN-IR-NEXT: v_lshl_b64 v[11:12], v[11:12], 1 +; GCN-IR-NEXT: v_lshr_b32_e32 v6, v5, v10 +; GCN-IR-NEXT: v_or_b32_e32 v11, v11, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, s12, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, 0, v11, vcc -; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 -; GCN-IR-NEXT: v_and_b32_e32 v14, 0x8000, v12 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8 -; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, s12, v11 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, 0, v12, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v13, v4 +; GCN-IR-NEXT: v_ashr_i32_e32 v13, v6, v10 +; GCN-IR-NEXT: v_and_b32_e32 v15, 0x8000, v13 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v13 +; GCN-IR-NEXT: v_add_i32_e32 v13, vcc, 1, v8 +; GCN-IR-NEXT: v_or_b32_e32 v5, v14, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v14, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[13:14], v[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v8, v13 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 -; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 -; 
GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v14 -; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v15, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v14 +; GCN-IR-NEXT: v_mov_b32_e32 v14, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 +; GCN-IR-NEXT: v_sub_i32_e64 v11, s[4:5], v11, v15 +; GCN-IR-NEXT: v_subb_u32_e64 v12, s[4:5], v12, v16, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz BB13_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow Index: llvm/test/CodeGen/AMDGPU/udiv64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/udiv64.ll +++ llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -174,40 +174,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB0_6 ; GCN-IR-NEXT: BB0_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[6:7], v4 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], s[6:7], v4 ; GCN-IR-NEXT: s_add_u32 s6, s2, -1 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GCN-IR-NEXT: s_addc_u32 s7, s3, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s7 ; GCN-IR-NEXT: BB0_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s7 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s6, v6 -; GCN-IR-NEXT: 
v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s2, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s3, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s6, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s2, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s3, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -885,40 +885,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB7_6 ; GCN-IR-NEXT: BB7_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[6:7], v4 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], s[6:7], v4 ; GCN-IR-NEXT: s_add_u32 s6, s2, -1 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; 
GCN-IR-NEXT: s_addc_u32 s7, s3, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s7 ; GCN-IR-NEXT: BB7_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s7 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s6, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s2, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s3, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s6, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s2, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s3, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: 
v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB7_5 ; GCN-IR-NEXT: BB7_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1089,38 +1089,39 @@ ; GCN-IR-NEXT: s_branch BB8_6 ; GCN-IR-NEXT: BB8_4: ; %udiv-preheader ; GCN-IR-NEXT: s_add_u32 s3, s6, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], 24, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 58, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], 24, v3 ; GCN-IR-NEXT: s_addc_u32 s8, s7, -1 -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, 58, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 ; GCN-IR-NEXT: BB8_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s3, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s6, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s7, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s3, v7 
+; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s6, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s7, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB8_5 ; GCN-IR-NEXT: BB8_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1377,30 +1378,31 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 31 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v6, v7, v4 +; GCN-IR-NEXT: v_lshr_b32_e32 v4, v3, v6 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v4 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v7 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v8, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v9, v2 +; GCN-IR-NEXT: v_ashr_i32_e32 v9, v4, v6 +; GCN-IR-NEXT: v_and_b32_e32 v11, 0x8000, v9 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v9 ; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v0 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v7, 31, v4 ; GCN-IR-NEXT: 
v_or_b32_e32 v3, v10, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1] -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v7 -; GCN-IR-NEXT: v_and_b32_e32 v7, 0x8000, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v0, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v5 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v6, v7 -; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v11, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v11 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v12, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v9, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] @@ -1581,22 +1583,23 @@ ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 23, v6 ; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v10, 24, v8 ; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v8, 24, v8 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 -; GCN-IR-NEXT: v_add_i32_e64 v8, s[0:1], 1, v4 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e64 v9, s[0:1], 0, v5, s[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[8:9], v[4:5] +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v3, s[0:1] +; GCN-IR-NEXT: 
s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB11_5 ; GCN-IR-NEXT: BB11_6: ; %udiv-loop-exit @@ -1753,25 +1756,26 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 31 ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 +; GCN-IR-NEXT: v_lshr_b32_e32 v4, v3, v6 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v4 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v7, v4 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 23, v6 -; GCN-IR-NEXT: v_or_b32_e32 v2, v9, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 23, v7 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v2, v9, v2 +; GCN-IR-NEXT: v_ashr_i32_e32 v9, v4, v6 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v9, 24, v9 +; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v7, v9 ; GCN-IR-NEXT: v_add_i32_e64 v9, s[4:5], 1, v0 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v7, 31, v4 ; GCN-IR-NEXT: v_or_b32_e32 v3, v10, v3 ; GCN-IR-NEXT: v_addc_u32_e64 v10, s[4:5], 0, v1, s[4:5] ; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], v[9:10], v[0:1] -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v7 -; GCN-IR-NEXT: v_and_b32_e32 v7, 24, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v0, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v6, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v5 ; GCN-IR-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc Index: llvm/test/CodeGen/AMDGPU/urem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/urem64.ll +++ llvm/test/CodeGen/AMDGPU/urem64.ll @@ -173,40 +173,40 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_branch BB0_6 ; GCN-IR-NEXT: BB0_4: ; %udiv-preheader -; GCN-IR-NEXT: v_not_b32_e32 v2, v2 ; GCN-IR-NEXT: 
s_add_u32 s8, s2, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[6:7], v4 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: s_addc_u32 s9, s3, -1 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_not_b32_e32 v2, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], s[6:7], v4 +; GCN-IR-NEXT: s_addc_u32 s9, s3, -1 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[0:1], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s9 ; GCN-IR-NEXT: BB0_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s9 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s8, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s2, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s3, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s8, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s2, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s3, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 
v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: BB0_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -897,38 +897,39 @@ ; GCN-IR-NEXT: s_branch BB6_6 ; GCN-IR-NEXT: BB6_4: ; %udiv-preheader ; GCN-IR-NEXT: s_add_u32 s3, s6, -1 -; GCN-IR-NEXT: v_lshr_b64 v[6:7], 24, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 58, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_lshr_b64 v[7:8], 24, v3 ; GCN-IR-NEXT: s_addc_u32 s8, s7, -1 -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, 58, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[0:1], 0, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 ; GCN-IR-NEXT: BB6_5: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s3, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s6, v8 -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s7, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; 
GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s3, v7 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v4, v8, vcc +; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v11, s6, v9 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v12, s7, v9 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_or_b32_e32 v1, v10, v1 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[0:1], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v3 +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v12, s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB6_5 ; GCN-IR-NEXT: BB6_6: ; %udiv-loop-exit ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1111,22 +1112,23 @@ ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 23, v6 ; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v10, 24, v8 ; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v8, 24, v8 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 -; GCN-IR-NEXT: v_add_i32_e64 v8, s[0:1], 1, v4 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; 
GCN-IR-NEXT: v_addc_u32_e64 v9, s[0:1], 0, v5, s[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[8:9], v[4:5] +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[0:1], v7, v3, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB7_5 ; GCN-IR-NEXT: BB7_6: ; %udiv-loop-exit @@ -1394,37 +1396,38 @@ ; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v7 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_lshr_b64 v[9:10], v[0:1], v7 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v6 ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 31 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: BB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 +; GCN-IR-NEXT: v_lshl_b64 v[9:10], v[9:10], 1 +; GCN-IR-NEXT: v_lshr_b32_e32 v4, v3, v8 +; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v4 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_and_b32_e32 v12, 0x8000, v10 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v3, 
v11, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v9 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v10, vcc +; GCN-IR-NEXT: v_or_b32_e32 v2, v11, v2 +; GCN-IR-NEXT: v_ashr_i32_e32 v11, v4, v8 +; GCN-IR-NEXT: v_and_b32_e32 v13, 0x8000, v11 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v11 +; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, 1, v6 +; GCN-IR-NEXT: v_or_b32_e32 v3, v12, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[11:12], v[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v6, v11 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 -; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v12 -; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v13, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v12 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v5 +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_sub_i32_e64 v9, s[4:5], v9, v13 +; GCN-IR-NEXT: v_subb_u32_e64 v10, s[4:5], v10, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz BB9_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow Index: llvm/test/CodeGen/Thumb2/mve-floatregloops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-floatregloops.ll +++ llvm/test/CodeGen/Thumb2/mve-floatregloops.ll @@ -7,13 +7,13 @@ ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.f32 q1, q1, r3 -; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; 
CHECK-NEXT: vadd.f32 q0, q0, r3 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: bne .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -52,13 +52,13 @@ ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.f32 q1, q1, r3 -; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: vadd.f32 q0, q0, r3 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: bne .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -97,13 +97,13 @@ ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vmul.f32 q1, q1, r3 -; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: vmul.f32 q0, q0, r3 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: bne .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -142,13 +142,13 @@ ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vmul.f32 q1, q1, r3 -; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: vmul.f32 q0, q0, r3 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: bne .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -187,13 +187,13 @@ ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: .LBB4_1: @ 
%vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vsub.f32 q1, q1, r3 -; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: vsub.f32 q0, q0, r3 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: bne .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -279,14 +279,14 @@ ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vfmas.f32 q2, q1, r12 -; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: vfmas.f32 q1, q0, r12 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: bne .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -329,14 +329,14 @@ ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vfmas.f32 q2, q1, r12 -; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: vfmas.f32 q1, q0, r12 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: bne .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -379,14 +379,14 @@ ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 
q2, [r1], #16 -; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vfma.f32 q2, q1, r12 -; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: vfma.f32 q1, q0, r12 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: bne .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -429,15 +429,15 @@ ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vldrw.u32 q2, [r0], #16 -; CHECK-NEXT: vldrw.u32 q3, [r1], #16 -; CHECK-NEXT: vdup.32 q1, r12 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vfma.f32 q3, q1, q2 -; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: vfma.f32 q2, q0, q1 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 ; CHECK-NEXT: bne .LBB9_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -481,16 +481,17 @@ ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: vneg.f32 q0, q0 ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vldrw.u32 q2, [r0], #16 -; CHECK-NEXT: vdup.32 q1, r12 -; CHECK-NEXT: vldrw.u32 q3, [r1], #16 -; CHECK-NEXT: vneg.f32 q1, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vfma.f32 q1, q3, q2 -; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: vfma.f32 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 ; CHECK-NEXT: bne .LBB10_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -585,15 +586,15 @@ ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: 
vmov r12, s0 ; CHECK-NEXT: .LBB12_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r1], #16 -; CHECK-NEXT: vldrw.u32 q2, [r0], #16 -; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vneg.f32 q1, q1 -; CHECK-NEXT: vfma.f32 q1, q2, r12 -; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: vfma.f32 q0, q1, r12 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: bne .LBB12_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -636,16 +637,16 @@ ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 ; CHECK-NEXT: .LBB13_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: vldrw.u32 q3, [r0], #16 -; CHECK-NEXT: vdup.32 q1, r12 -; CHECK-NEXT: vneg.f32 q2, q2 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vfma.f32 q2, q1, q3 -; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: vneg.f32 q1, q1 +; CHECK-NEXT: vfma.f32 q1, q0, q2 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: bne .LBB13_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -781,13 +782,13 @@ ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: ldrh r4, [r0] -; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: ldr r5, [r0, #4] ; CHECK-NEXT: subs r7, r4, #1 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhi .LBB15_6 ; CHECK-NEXT: @ %bb.1: @ %if.then ; CHECK-NEXT: ldr r6, [r0, #8] -; CHECK-NEXT: add.w r11, r12, r7, lsl #2 +; CHECK-NEXT: add.w r11, r5, r7, lsl #2 ; CHECK-NEXT: lsr.w lr, r3, #2 ; CHECK-NEXT: vldr s0, [r6] ; CHECK-NEXT: vldr s2, [r6, #4] @@ -795,41 +796,43 @@ ; CHECK-NEXT: vldr s6, [r6, #12] ; CHECK-NEXT: wls lr, lr, .LBB15_5 ; CHECK-NEXT: @ %bb.2: @ 
%while.body.lr.ph -; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: bic r4, r3, #3 +; CHECK-NEXT: strd r3, r4, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: vmov r10, s4 +; CHECK-NEXT: vmov r12, s6 +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-NEXT: vmov r8, s0 +; CHECK-NEXT: add.w r3, r2, r3, lsl #2 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: str r4, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r4, r2, r4, lsl #2 -; CHECK-NEXT: str r4, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: .LBB15_3: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r7, r1, r6 -; CHECK-NEXT: add.w r5, r12, r6 -; CHECK-NEXT: vldrw.u32 q2, [r7] +; CHECK-NEXT: add.w r9, r1, r6 ; CHECK-NEXT: add.w r7, r11, r6 -; CHECK-NEXT: vmov r10, s0 +; CHECK-NEXT: vldrw.u32 q2, [r9] +; CHECK-NEXT: adds r5, r3, r6 ; CHECK-NEXT: vstrw.32 q2, [r7] -; CHECK-NEXT: vmov r9, s2 -; CHECK-NEXT: vldrw.u32 q2, [r5] -; CHECK-NEXT: vmov r4, s4 ; CHECK-NEXT: adds r7, r2, r6 +; CHECK-NEXT: vldrw.u32 q2, [r5] ; CHECK-NEXT: adds r6, #16 -; CHECK-NEXT: vmul.f32 q2, q2, r10 +; CHECK-NEXT: vmul.f32 q2, q2, r8 ; CHECK-NEXT: vldrw.u32 q3, [r5, #4] -; CHECK-NEXT: vmov r8, s6 -; CHECK-NEXT: vfma.f32 q2, q3, r9 +; CHECK-NEXT: vfma.f32 q2, q3, r4 ; CHECK-NEXT: vldrw.u32 q3, [r5, #8] ; CHECK-NEXT: vldrw.u32 q4, [r5, #12] -; CHECK-NEXT: vfma.f32 q2, q3, r4 -; CHECK-NEXT: vfma.f32 q2, q4, r8 +; CHECK-NEXT: vfma.f32 q2, q3, r10 +; CHECK-NEXT: vfma.f32 q2, q4, r12 ; CHECK-NEXT: vstrw.32 q2, [r7] ; CHECK-NEXT: le lr, .LBB15_3 ; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-NEXT: add r4, sp, #4 ; CHECK-NEXT: add r11, r6 -; CHECK-NEXT: add.w r12, r12, r2, lsl #2 +; CHECK-NEXT: add.w r5, r3, r2, lsl #2 ; CHECK-NEXT: add.w r1, r1, r2, lsl #2 -; CHECK-NEXT: ldrd r2, r4, [sp, #8] 
@ 8-byte Folded Reload +; CHECK-NEXT: ldm r4, {r2, r3, r4} @ 12-byte Folded Reload ; CHECK-NEXT: .LBB15_5: @ %while.end ; CHECK-NEXT: and r7, r3, #3 ; CHECK-NEXT: vldrw.u32 q2, [r1] @@ -837,36 +840,37 @@ ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q2, [r11] ; CHECK-NEXT: vmov r6, s2 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vldrw.u32 q0, [r12] -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vldrw.u32 q0, [r5] +; CHECK-NEXT: vmov r12, s6 ; CHECK-NEXT: vmov r7, s4 -; CHECK-NEXT: vmul.f32 q0, q0, r5 -; CHECK-NEXT: vldrw.u32 q1, [r12, #4] +; CHECK-NEXT: vmul.f32 q0, q0, r1 +; CHECK-NEXT: vldrw.u32 q1, [r5, #4] ; CHECK-NEXT: vfma.f32 q0, q1, r6 -; CHECK-NEXT: vldrw.u32 q1, [r12, #8] +; CHECK-NEXT: vldrw.u32 q1, [r5, #8] ; CHECK-NEXT: vfma.f32 q0, q1, r7 -; CHECK-NEXT: vldrw.u32 q1, [r12, #12] -; CHECK-NEXT: vfma.f32 q0, q1, r1 +; CHECK-NEXT: vldrw.u32 q1, [r5, #12] +; CHECK-NEXT: vfma.f32 q0, q1, r12 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r2] -; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: ldr r5, [r0, #4] ; CHECK-NEXT: .LBB15_6: @ %if.end -; CHECK-NEXT: add.w r0, r12, r3, lsl #2 +; CHECK-NEXT: add.w r0, r5, r3, lsl #2 +; CHECK-NEXT: mov r2, r5 ; CHECK-NEXT: lsr.w lr, r4, #2 ; CHECK-NEXT: wls lr, lr, .LBB15_10 ; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader -; CHECK-NEXT: bic r2, r4, #3 -; CHECK-NEXT: adds r1, r2, r3 -; CHECK-NEXT: mov r3, r12 -; CHECK-NEXT: add.w r1, r12, r1, lsl #2 +; CHECK-NEXT: bic r7, r4, #3 +; CHECK-NEXT: adds r1, r7, r3 +; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: .LBB15_8: @ %while.body51 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vstrb.8 q0, [r3], #16 ; CHECK-NEXT: le lr, .LBB15_8 ; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit -; CHECK-NEXT: add.w r12, r12, r2, lsl #2 +; CHECK-NEXT: add.w r2, r2, r7, lsl #2 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: .LBB15_10: @ %while.end55 ; CHECK-NEXT: ands r1, r4, #3 @@ -875,7 +879,7 @@ 
; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r12] +; CHECK-NEXT: vstrwt.32 q0, [r2] ; CHECK-NEXT: .LBB15_12: @ %if.end61 ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9} Index: llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll @@ -187,20 +187,19 @@ ; CHECK-NEXT: mvn r2, #3 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: add.w lr, r2, r1, lsr #2 +; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vcmp.f32 ge, q2, r1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vcmpt.f32 le, q2, r1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpt.f32 ge, q1, r1 +; CHECK-NEXT: vcmpt.f32 le, q1, r2 ; CHECK-NEXT: vpnot ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q1, [r0], #16 +; CHECK-NEXT: vstrwt.32 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -249,18 +248,18 @@ ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: vneg.f16 s2, s0 ; CHECK-NEXT: add.w lr, r2, r1, lsr #3 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vldrh.u16 q2, [r0] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vpt.f16 ge, q2, r2 -; CHECK-NEXT: vcmpt.f16 le, q2, r1 +; CHECK-NEXT: vldrh.u16 q1, [r0] +; CHECK-NEXT: vpt.f16 ge, q1, r2 +; CHECK-NEXT: vcmpt.f16 le, q1, r1 ; CHECK-NEXT: vpnot ; CHECK-NEXT: vpst -; 
CHECK-NEXT: vstrht.16 q1, [r0], #16 +; CHECK-NEXT: vstrht.16 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -487,20 +486,19 @@ ; CHECK-NEXT: mvn r2, #3 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: add.w lr, r2, r1, lsr #2 +; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vcmp.f32 ge, q2, r1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vcmpt.f32 le, q2, r1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpt.f32 ge, q1, r1 +; CHECK-NEXT: vcmpt.f32 le, q1, r2 ; CHECK-NEXT: vpnot ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q1, [r0], #16 +; CHECK-NEXT: vstrwt.32 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -549,18 +547,18 @@ ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: vneg.f16 s2, s0 ; CHECK-NEXT: add.w lr, r2, r1, lsr #3 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vldrh.u16 q2, [r0] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vpt.f16 ge, q2, r2 -; CHECK-NEXT: vcmpt.f16 le, q2, r1 +; CHECK-NEXT: vldrh.u16 q1, [r0] +; CHECK-NEXT: vpt.f16 ge, q1, r2 +; CHECK-NEXT: vcmpt.f16 le, q1, r1 ; CHECK-NEXT: vpnot ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q1, [r0], #16 +; CHECK-NEXT: vstrht.16 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB9_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc}