diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -130,7 +130,7 @@
   }
 
   /// Given \p MO is a PhysReg use return if it can be ignored for the purpose
-  /// of instruction rematerialization.
+  /// of instruction rematerialization or sinking.
   virtual bool isIgnorableUse(const MachineOperand &MO) const {
     return false;
   }
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -796,9 +796,14 @@
     if (Reg == 0)
       continue;
 
-    // Don't handle physical register.
-    if (Register::isPhysicalRegister(Reg))
+    if (Register::isPhysicalRegister(Reg)) {
+      if (MO.isUse() &&
+          (MRI->isConstantPhysReg(Reg) || TII->isIgnorableUse(MO)))
+        continue;
+
+      // Don't handle non-constant and non-ignorable physical register.
       return false;
+    }
 
     // Users for the defs are all dominated by SuccToSinkTo.
     if (MO.isDef()) {
@@ -898,7 +903,7 @@
       // If the physreg has no defs anywhere, it's just an ambient register
      // and we can freely move its uses. Alternatively, if it's allocatable,
      // it could get allocated to something with a def during allocation.
-      if (!MRI->isConstantPhysReg(Reg))
+      if (!MRI->isConstantPhysReg(Reg) && !TII->isIgnorableUse(MO))
         return nullptr;
     } else if (!MO.isDead()) {
       // A def that isn't dead. We can't move it.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -2956,19 +2956,18 @@
 ; CGP-LABEL: v_sdiv_v2i64_pow2_shl_denom:
 ; CGP: ; %bb.0:
 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: s_mov_b64 s[4:5], 0x1000
+; CGP-NEXT: s_mov_b64 s[6:7], 0x1000
 ; CGP-NEXT: v_mov_b32_e32 v5, v2
 ; CGP-NEXT: v_mov_b32_e32 v7, v3
-; CGP-NEXT: v_lshl_b64 v[2:3], s[4:5], v4
+; CGP-NEXT: v_lshl_b64 v[2:3], s[6:7], v4
 ; CGP-NEXT: v_mov_b32_e32 v9, v1
 ; CGP-NEXT: v_mov_b32_e32 v8, v0
 ; CGP-NEXT: v_or_b32_e32 v1, v9, v3
 ; CGP-NEXT: v_mov_b32_e32 v0, 0
-; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v6
 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
+; CGP-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; CGP-NEXT: s_cbranch_execz .LBB8_2
 ; CGP-NEXT: ; %bb.1:
 ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v3
@@ -2978,134 +2977,134 @@
 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v0
 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1
 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v9
+; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9
 ; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4
 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v6
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v6, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v10
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc
 ; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v3
 ; CGP-NEXT: v_trunc_f32_e32 v9, v9
 ; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v9
 ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9
-; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v1
-; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v2, vcc
-; CGP-NEXT: v_mul_lo_u32 v14, v13, v3
-; CGP-NEXT: v_mul_lo_u32 v15, v12, v9
-; CGP-NEXT: v_mul_hi_u32 v17, v12, v3
-; CGP-NEXT: v_mul_lo_u32 v16, v12, v3
-; CGP-NEXT: v_xor_b32_e32
v4, v4, v6 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v12, v3 +; CGP-NEXT: v_mul_lo_u32 v14, v11, v9 +; CGP-NEXT: v_mul_hi_u32 v16, v11, v3 +; CGP-NEXT: v_mul_lo_u32 v15, v11, v3 +; CGP-NEXT: v_xor_b32_e32 v4, v4, v10 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_mul_lo_u32 v14, v9, v15 +; CGP-NEXT: v_mul_lo_u32 v16, v3, v13 +; CGP-NEXT: v_mul_hi_u32 v17, v3, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v9, v15 +; CGP-NEXT: v_xor_b32_e32 v8, v8, v10 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_mul_lo_u32 v15, v9, v16 -; CGP-NEXT: v_mul_lo_u32 v17, v3, v14 -; CGP-NEXT: v_mul_hi_u32 v18, v3, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v9, v16 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v6 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v18, v9, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v17, v9, v13 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v16, v3, v13 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; CGP-NEXT: v_mul_hi_u32 v17, v3, v14 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_mul_hi_u32 v14, v9, v14 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v15 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v14, v12, v9 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v3 -; CGP-NEXT: v_mul_hi_u32 v12, v12, v3 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v9, v15 -; CGP-NEXT: v_mul_lo_u32 v14, v3, v12 -; CGP-NEXT: v_mul_hi_u32 v16, v3, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v9, v15 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v9, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v3, v12 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v8, v3 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v4, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v3, 
vcc, v3, v14 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v12, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v11, v9 +; CGP-NEXT: v_mul_lo_u32 v14, v11, v3 +; CGP-NEXT: v_mul_hi_u32 v11, v11, v3 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v14 +; CGP-NEXT: v_mul_lo_u32 v13, v3, v11 +; CGP-NEXT: v_mul_hi_u32 v15, v3, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v9, v14 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v8, v9 +; CGP-NEXT: v_mul_lo_u32 v15, v9, v11 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v13, v4, v9 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v13, v3, v11 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v12, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v13, v3 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v12, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v13, v1, v9 -; CGP-NEXT: v_mul_hi_u32 v15, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v14, v1, v3 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v14 -; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v8, v12, vcc -; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v12 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v2 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_mul_lo_u32 v11, v2, v3 +; CGP-NEXT: v_mul_lo_u32 v12, v1, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v1, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v1, v3 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_subb_u32_e64 v12, s[4:5], v8, v11, vcc +; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v11 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v2 ; CGP-NEXT: v_subb_u32_e32 v8, vcc, v8, v2, vcc -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 
s[4:5], v4, v1 ; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v1 ; CGP-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v2 -; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] -; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v9, vcc +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v2 +; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v3 +; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v13, s[4:5] +; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v2 -; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, 0, v14, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v12 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, 0, v13, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v2, v14, v4, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v13, v4, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v6, v0 +; CGP-NEXT: v_xor_b32_e32 v3, v10, v0 ; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 @@ -3114,8 +3113,9 @@ ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow2 -; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] +; CGP-NEXT: s_or_saveexec_b64 s[8:9], s[8:9] +; CGP-NEXT: v_lshl_b64 v[9:10], s[6:7], v6 +; CGP-NEXT: s_xor_b64 exec, exec, s[8:9] ; CGP-NEXT: s_cbranch_execz .LBB8_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2 @@ -3139,8 +3139,8 @@ ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: .LBB8_4: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_or_b32_e32 v3, v7, v11 +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] +; CGP-NEXT: v_or_b32_e32 v3, v7, v10 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -3148,9 +3148,9 @@ ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_6 ; CGP-NEXT: ; %bb.5: -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v11 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v2 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v2, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v10 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v2 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v10, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v3, v3, v2 ; CGP-NEXT: v_xor_b32_e32 v4, v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 @@ -3288,15 +3288,15 @@ ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc -; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: .LBB8_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] ; CGP-NEXT: s_cbranch_execz .LBB8_8 ; CGP-NEXT: ; %bb.7: -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 
0, v10 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v9 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -3304,15 +3304,15 @@ ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, v10 +; CGP-NEXT: v_mul_lo_u32 v3, v2, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v10 +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v9 ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: .LBB8_8: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -2912,19 +2912,18 @@ ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b64 s[4:5], 0x1000 +; CGP-NEXT: s_mov_b64 s[6:7], 0x1000 ; CGP-NEXT: v_mov_b32_e32 v5, v2 ; CGP-NEXT: v_mov_b32_e32 v7, v3 -; CGP-NEXT: v_lshl_b64 v[2:3], s[4:5], v4 +; CGP-NEXT: v_lshl_b64 v[2:3], s[6:7], v4 ; CGP-NEXT: v_mov_b32_e32 v9, v1 ; CGP-NEXT: v_mov_b32_e32 v8, v0 ; CGP-NEXT: v_or_b32_e32 v1, v9, v3 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v3 @@ -2938,129 +2937,129 @@ ; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v4, vcc +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v4, vcc ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v2 -; CGP-NEXT: v_trunc_f32_e32 v8, v8 -; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 +; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v2 +; CGP-NEXT: v_trunc_f32_e32 v9, v9 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v9 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v0, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v12, v2 -; CGP-NEXT: v_mul_lo_u32 v14, v9, v8 -; CGP-NEXT: v_mul_hi_u32 v16, v9, v2 -; CGP-NEXT: v_mul_lo_u32 v15, v9, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v1 +; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v0, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v11, v2 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v15, v10, v2 +; CGP-NEXT: v_mul_lo_u32 v14, v10, v2 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v2, v12 +; CGP-NEXT: v_mul_hi_u32 v16, v2, v14 +; CGP-NEXT: v_mul_hi_u32 v14, 
v9, v14 +; CGP-NEXT: v_xor_b32_e32 v8, v8, v4 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; CGP-NEXT: v_mul_lo_u32 v14, v8, v15 -; CGP-NEXT: v_mul_lo_u32 v16, v2, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v2, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v8, v15 -; CGP-NEXT: v_xor_b32_e32 v6, v6, v4 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v17, v8, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v16, v9, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_mul_hi_u32 v15, v2, v12 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v16, v2, v13 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v12, v2 -; CGP-NEXT: v_mul_lo_u32 v13, v9, v8 -; CGP-NEXT: v_mul_lo_u32 v14, v9, v2 -; CGP-NEXT: v_mul_hi_u32 v9, v9, v2 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; CGP-NEXT: v_mul_lo_u32 v12, v8, v14 -; CGP-NEXT: v_mul_lo_u32 v13, v2, v9 -; CGP-NEXT: v_mul_hi_u32 v15, v2, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v14 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v8, v9 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v13, v2, v9 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v11, v2 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v2 +; CGP-NEXT: v_mul_hi_u32 v10, v10, v2 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v9, v13 +; CGP-NEXT: v_mul_lo_u32 v12, v2, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v9, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v12, v2, v10 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: 
v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v6, v2 -; CGP-NEXT: v_mul_lo_u32 v12, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v3, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v6, v2 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v6, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; CGP-NEXT: v_mul_hi_u32 v12, v3, v8 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_mul_hi_u32 v10, v9, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_mul_lo_u32 v9, v0, v2 -; CGP-NEXT: v_mul_lo_u32 v8, v1, v8 -; CGP-NEXT: v_mul_lo_u32 v12, v1, v2 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v8, v2 +; CGP-NEXT: v_mul_lo_u32 v11, v3, v9 +; CGP-NEXT: v_mul_hi_u32 v12, v3, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v8, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v3, v9 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v0, v2 +; CGP-NEXT: v_mul_lo_u32 v9, v1, v9 +; CGP-NEXT: v_mul_lo_u32 v11, v1, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v12 -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v6, v2, vcc -; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v6, v2 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v0 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v11 +; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v8, v2, vcc +; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v8, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v0 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v0 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v0 ; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v0, vcc -; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v3, v1 -; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v2, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v0 +; CGP-NEXT: v_cndmask_b32_e64 v8, v8, 
v10, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v10, vcc, v3, v1 +; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v2, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v0 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v0 ; CGP-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 ; CGP-NEXT: v_xor_b32_e32 v2, v0, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 @@ -3068,7 +3067,8 @@ ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow2 -; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] +; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[8:9] +; CGP-NEXT: v_lshl_b64 v[9:10], s[6:7], v6 ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_4 ; CGP-NEXT: ; %bb.3: @@ -3092,7 +3092,7 @@ ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: .LBB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_or_b32_e32 v3, v7, v11 +; CGP-NEXT: v_or_b32_e32 v3, v7, v10 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -3100,9 +3100,9 @@ ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_6 ; CGP-NEXT: ; %bb.5: -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v11 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v2 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v2, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v10 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v2 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v10, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v3, v3, v2 ; CGP-NEXT: v_xor_b32_e32 v2, v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v3 @@ -3238,15 +3238,15 @@ ; CGP-NEXT: v_xor_b32_e32 v4, v2, v8 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v3, v8 ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v4, v8, vcc -; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: .LBB8_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_8 ; CGP-NEXT: ; %bb.7: -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v10 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v9 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -3254,13 +3254,13 @@ ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 -; CGP-NEXT: v_mul_lo_u32 v2, v2, v10 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v9 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; 
CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v9 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v9 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: .LBB8_8: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -1504,21 +1504,20 @@ ; CGP-NEXT: v_mov_b32_e32 v9, v1 ; CGP-NEXT: v_mov_b32_e32 v5, v2 ; CGP-NEXT: v_mov_b32_e32 v7, v3 -; CGP-NEXT: s_mov_b64 s[4:5], 0x1000 -; CGP-NEXT: v_lshl_b64 v[2:3], s[4:5], v4 -; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 +; CGP-NEXT: s_mov_b64 s[6:7], 0x1000 +; CGP-NEXT: v_lshl_b64 v[2:3], s[6:7], v4 ; CGP-NEXT: v_or_b32_e32 v1, v9, v3 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 ; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1527,124 +1526,125 @@ ; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v1 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v6, v0 -; CGP-NEXT: v_mul_hi_u32 v15, v4, v0 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_lo_u32 v14, v1, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v1, v13 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; CGP-NEXT: v_mul_lo_u32 v15, v0, v12 -; CGP-NEXT: v_mul_lo_u32 v17, v1, v12 -; CGP-NEXT: v_mul_hi_u32 v18, v0, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v1, v12 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v16 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v4, v1 ; CGP-NEXT: v_mul_lo_u32 v12, v4, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v0 -; CGP-NEXT: v_mul_hi_u32 v13, v4, v0 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v1 -; CGP-NEXT: v_mul_lo_u32 v14, v1, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v13, v1, v12 ; CGP-NEXT: v_mul_hi_u32 v15, v0, v12 ; CGP-NEXT: 
v_mul_hi_u32 v12, v1, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_mul_lo_u32 v6, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v13, v1, v4 -; CGP-NEXT: v_mul_hi_u32 v16, v0, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_mul_lo_u32 v14, v0, v11 +; CGP-NEXT: v_mul_lo_u32 v16, v1, v11 +; CGP-NEXT: v_mul_hi_u32 v17, v0, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v1, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v15 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v11, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v4, v0 +; CGP-NEXT: v_mul_lo_u32 v10, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v0 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v1 +; CGP-NEXT: v_mul_lo_u32 v13, v1, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v0, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v1, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v10, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v12, v1, v4 +; CGP-NEXT: v_mul_hi_u32 v15, v0, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_mul_lo_u32 v4, v9, v0 -; CGP-NEXT: v_mul_hi_u32 v6, v8, v0 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v0 ; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v8, v1 -; CGP-NEXT: v_mul_lo_u32 v13, v9, v1 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v1 +; CGP-NEXT: v_mul_lo_u32 v11, v8, v1 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v1 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v1 ; CGP-NEXT: v_mul_hi_u32 v1, v9, v1 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; CGP-NEXT: v_cndmask_b32_e64 
v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v2, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v3, v0 -; CGP-NEXT: v_mul_hi_u32 v13, v2, v0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v2, v0 +; CGP-NEXT: v_mul_lo_u32 v11, v3, v0 +; CGP-NEXT: v_mul_hi_u32 v12, v2, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_mul_lo_u32 v4, v2, v1 -; CGP-NEXT: v_add_i32_e32 v14, vcc, 1, v0 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v14 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v9, v4, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v0 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v13 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v9, v4, vcc ; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v9, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v3, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v3 -; CGP-NEXT: v_cndmask_b32_e32 v8, v13, v9, vcc -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v6, v2 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v3 +; CGP-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v2 ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v3 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v2, v14, v12, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v15, v16, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; CGP-NEXT: v_cndmask_b32_e32 v2, v13, v11, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v14, v15, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow2 -; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] +; 
CGP-NEXT: s_or_saveexec_b64 s[8:9], s[8:9] +; CGP-NEXT: v_lshl_b64 v[9:10], s[6:7], v6 +; CGP-NEXT: s_xor_b64 exec, exec, s[8:9] ; CGP-NEXT: s_cbranch_execz .LBB8_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2 @@ -1668,8 +1668,8 @@ ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: .LBB8_4: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_or_b32_e32 v3, v7, v11 +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] +; CGP-NEXT: v_or_b32_e32 v3, v7, v10 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -1677,10 +1677,10 @@ ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_6 ; CGP-NEXT: ; %bb.5: -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v11 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v10 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v11, vcc +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v9 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v10 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v9 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v10, vcc ; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 @@ -1690,13 +1690,13 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_mul_lo_u32 v8, v4, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v11, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v12, v6, v2 ; CGP-NEXT: v_mul_hi_u32 v13, v4, v2 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; CGP-NEXT: v_mul_lo_u32 v12, v3, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v2, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v3, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v3, v11 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 ; CGP-NEXT: v_mul_lo_u32 v13, v2, v8 ; CGP-NEXT: v_mul_lo_u32 v15, v3, v8 @@ -1704,46 +1704,46 @@ ; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v15, v9 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v16 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v16 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc ; CGP-NEXT: v_mul_lo_u32 v8, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v6, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v9, v4, v2 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 ; CGP-NEXT: v_mul_lo_u32 v12, v3, v8 ; CGP-NEXT: v_mul_hi_u32 v13, v2, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_mul_lo_u32 v6, v2, v4 -; CGP-NEXT: v_mul_lo_u32 v9, v3, v4 +; CGP-NEXT: v_mul_lo_u32 v11, v3, v4 ; CGP-NEXT: v_mul_hi_u32 v14, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: 
v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc @@ -1751,50 +1751,50 @@ ; CGP-NEXT: v_mul_hi_u32 v6, v5, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v7, v2 ; CGP-NEXT: v_mul_lo_u32 v8, v5, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v7, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v7, v3 ; CGP-NEXT: v_mul_hi_u32 v12, v5, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v7, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v11, v2 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v10, v2 -; CGP-NEXT: v_mul_lo_u32 v8, v11, v2 -; CGP-NEXT: v_mul_hi_u32 v9, v10, v2 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v2 +; CGP-NEXT: v_mul_lo_u32 v8, v10, v2 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v9, v3 ; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v2 ; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 ; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v7, v4, vcc ; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v7, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v11 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v11, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v11 -; CGP-NEXT: v_cndmask_b32_e32 v6, v9, v7, vcc -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v10, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v10 +; CGP-NEXT: v_cndmask_b32_e32 v6, v11, v7, vcc +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v11 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v11 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc @@ -1802,15 +1802,15 @@ ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: .LBB8_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] ; CGP-NEXT: s_cbranch_execz .LBB8_8 ; CGP-NEXT: ; %bb.7: -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v10 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v9 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -1818,15 +1818,15 @@ ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, v10 +; CGP-NEXT: v_mul_lo_u32 v3, v2, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v10 +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v9 ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: .LBB8_8: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -2042,21 +2042,20 @@ ; CGP-NEXT: v_mov_b32_e32 v9, v1 ; CGP-NEXT: v_mov_b32_e32 v5, v2 ; CGP-NEXT: v_mov_b32_e32 v7, v3 -; CGP-NEXT: s_mov_b64 s[4:5], 0x1000 -; CGP-NEXT: v_lshl_b64 v[2:3], s[4:5], v4 -; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 +; CGP-NEXT: s_mov_b64 s[6:7], 0x1000 +; CGP-NEXT: v_lshl_b64 v[2:3], s[6:7], v4 ; CGP-NEXT: v_or_b32_e32 v1, v9, v3 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; CGP-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 ; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2065,122 +2064,123 @@ ; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v1 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v6, v0 -; CGP-NEXT: v_mul_hi_u32 v15, v4, v0 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_lo_u32 v14, v1, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v1, v13 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; CGP-NEXT: v_mul_lo_u32 v15, v0, v12 -; CGP-NEXT: v_mul_lo_u32 v17, v1, 
v12 -; CGP-NEXT: v_mul_hi_u32 v18, v0, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v1, v12 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v16 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v4, v1 ; CGP-NEXT: v_mul_lo_u32 v12, v4, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v0 -; CGP-NEXT: v_mul_hi_u32 v13, v4, v0 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v1 -; CGP-NEXT: v_mul_lo_u32 v14, v1, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v13, v1, v12 ; CGP-NEXT: v_mul_hi_u32 v15, v0, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v1, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_mul_lo_u32 v6, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v13, v1, v4 -; CGP-NEXT: v_mul_hi_u32 v16, v0, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_mul_lo_u32 v14, v0, v11 +; CGP-NEXT: v_mul_lo_u32 v16, v1, v11 +; CGP-NEXT: v_mul_hi_u32 v17, v0, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v1, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v15 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v11, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v4, v0 +; CGP-NEXT: v_mul_lo_u32 v10, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v0 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v1 +; CGP-NEXT: v_mul_lo_u32 v13, v1, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v0, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v1, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v10, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v12, v1, v4 +; CGP-NEXT: v_mul_hi_u32 v15, v0, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; 
CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_mul_lo_u32 v4, v9, v0 -; CGP-NEXT: v_mul_hi_u32 v6, v8, v0 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v0 ; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v8, v1 -; CGP-NEXT: v_mul_lo_u32 v13, v9, v1 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v1 +; CGP-NEXT: v_mul_lo_u32 v11, v8, v1 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v1 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v1 ; CGP-NEXT: v_mul_hi_u32 v1, v9, v1 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v2, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v3, v0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v2, v0 +; CGP-NEXT: v_mul_lo_u32 v11, v3, v0 ; CGP-NEXT: v_mul_hi_u32 v0, v2, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_mul_lo_u32 v1, v2, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v6 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v10 ; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v0, vcc ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v9, v0 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v3, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; CGP-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v1, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v1, v2 +; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3 -; CGP-NEXT: 
v_cndmask_b32_e64 v13, 0, -1, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v12, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v0, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow2 -; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] +; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[8:9] +; CGP-NEXT: v_lshl_b64 v[9:10], s[6:7], v6 ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_4 ; CGP-NEXT: ; %bb.3: @@ -2204,7 +2204,7 @@ ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: .LBB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_or_b32_e32 v3, v7, v11 +; CGP-NEXT: v_or_b32_e32 v3, v7, v10 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -2212,10 +2212,10 @@ ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_6 ; CGP-NEXT: ; %bb.5: -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v11 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v10 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v11, vcc +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v9 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v10 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v9 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v10, vcc ; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 @@ -2225,13 +2225,13 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_mul_lo_u32 v8, v4, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v11, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v12, v6, v2 ; CGP-NEXT: v_mul_hi_u32 v13, v4, v2 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; CGP-NEXT: v_mul_lo_u32 v12, v3, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v2, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v3, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v3, v11 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 ; CGP-NEXT: v_mul_lo_u32 v13, v2, v8 ; CGP-NEXT: v_mul_lo_u32 v15, v3, v8 @@ -2239,46 +2239,46 @@ ; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v15, v9 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v16 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v16 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: 
v_add_i32_e32 v8, vcc, v8, v12 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc ; CGP-NEXT: v_mul_lo_u32 v8, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v6, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v9, v4, v2 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 ; CGP-NEXT: v_mul_lo_u32 v12, v3, v8 ; CGP-NEXT: v_mul_hi_u32 v13, v2, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_mul_lo_u32 v6, v2, v4 -; CGP-NEXT: v_mul_lo_u32 v9, v3, v4 +; CGP-NEXT: v_mul_lo_u32 v11, v3, v4 ; CGP-NEXT: v_mul_hi_u32 v14, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc @@ -2286,65 +2286,65 @@ ; CGP-NEXT: v_mul_hi_u32 v6, v5, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v7, v2 ; CGP-NEXT: v_mul_lo_u32 v8, v5, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v7, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v7, v3 ; CGP-NEXT: v_mul_hi_u32 v12, v5, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v7, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v11, v2 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v10, v2 -; CGP-NEXT: v_mul_lo_u32 v8, v11, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v10, v2 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v2 +; CGP-NEXT: v_mul_lo_u32 v8, v10, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v9, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_lo_u32 v3, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v3, v9, v3 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v6 ; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v7, v2, vcc ; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v7, v2 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v11 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v11, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 
v4, v11 +; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v10, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v3, v10 +; CGP-NEXT: v_sub_i32_e32 v6, vcc, v3, v9 ; CGP-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v2, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v9 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v11, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v11 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; CGP-NEXT: v_sub_i32_e32 v10, vcc, v6, v10 +; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v10, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v6, v9 ; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v11 -; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v10 +; CGP-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc -; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: .LBB8_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_8 ; CGP-NEXT: ; %bb.7: -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v10 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v9 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -2352,13 +2352,13 @@ ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 -; CGP-NEXT: v_mul_lo_u32 v2, v2, v10 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v9 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v9 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v9 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: .LBB8_8: diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -156,18 +156,18 @@ ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block: -; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 - -; GFX9-MUBUF: v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32 -; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32 - ; GCN: s_and_saveexec_b64 ; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}} -; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]] - ; GFX9-MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}} ; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 glc{{$}} + +; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 +; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, 
[[SHIFT]] + +; GFX9-MUBUF: v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32 +; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32 + ; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]] ; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]] diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir @@ -0,0 +1,734 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX9 %s + +--- +name: test_sink_fmac_to_only_use +alignment: 1 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GFX9-LABEL: name: test_sink_fmac_to_only_use + ; GFX9: bb.0: + ; GFX9-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX9-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]] + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]] + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec + ; GFX9-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: S_BRANCH %bb.1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.1: + ; GFX9-NEXT: successors: %bb.2(0x80000000) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: %9:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: %12:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %9, %10, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.2: + ; GFX9-NEXT: successors: %bb.3(0x80000000) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1 + ; GFX9-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1 + ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; 
GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.3: + ; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]] + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = COPY $vgpr1 + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64 = S_MOV_B64 0 + %6:sreg_64 = S_MOV_B64 0 + %7:vreg_64 = COPY %5 + %8:vreg_64 = COPY %6 + %9:vgpr_32 = GLOBAL_LOAD_DWORD killed %7, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + %11:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %10, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + %12:vgpr_32 = GLOBAL_LOAD_DWORD killed %8, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec + %14:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %13, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec + %15:vgpr_32(s32) = COPY $vgpr0 + %16:sreg_32 = S_MOV_B32 1 + %17:sreg_64 = V_CMP_LT_I32_e64 %15(s32), %16, implicit $exec + %18:sreg_64 = COPY %17 + %19:sreg_64 = SI_IF %18, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %20:vgpr_32 = V_ADD_F32_e32 %10, %11, implicit $mode, implicit $exec + %21:vgpr_32 = V_ADD_F32_e32 %13, %14, implicit $mode, implicit $exec + + bb.2: + %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1 + %23:vgpr_32 = PHI %4, %bb.0, %21, %bb.1 + SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + + bb.3: + S_ENDPGM 0, implicit %22, implicit %23 +... +--- +name: test_no_sink_into_if_cond_multiple_uses +alignment: 1 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GFX9-LABEL: name: test_no_sink_into_if_cond_multiple_uses + ; GFX9: bb.0: + ; GFX9-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX9-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]] + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]] + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: %9:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: %12:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX9-NEXT: 
[[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec + ; GFX9-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: S_BRANCH %bb.1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.1: + ; GFX9-NEXT: successors: %bb.2(0x80000000) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %9, %10, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.2: + ; GFX9-NEXT: successors: %bb.3(0x80000000) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1 + ; GFX9-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1 + ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.3: + ; GFX9-NEXT: [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %13, %10, implicit $mode, implicit $exec + ; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]] + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = COPY $vgpr1 + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64 = S_MOV_B64 0 + %6:sreg_64 = S_MOV_B64 0 + %7:vreg_64 = COPY %5 + %8:vreg_64 = COPY %6 + %9:vgpr_32 = GLOBAL_LOAD_DWORD killed %7, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + %11:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %10, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + %12:vgpr_32 = GLOBAL_LOAD_DWORD killed %8, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec + %14:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %13, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec + %15:vgpr_32(s32) = COPY $vgpr0 + %16:sreg_32 = S_MOV_B32 1 + %17:sreg_64 = V_CMP_LT_I32_e64 %15(s32), %16, implicit $exec + %18:sreg_64 = COPY %17 + %19:sreg_64 = SI_IF %18, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %20:vgpr_32 = V_ADD_F32_e32 %10, %11, implicit $mode, implicit $exec + %21:vgpr_32 = V_ADD_F32_e32 %13, %14, implicit $mode, implicit $exec + + bb.2: + %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1 + %23:vgpr_32 = PHI %4, %bb.0, %21, %bb.1 + SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + + bb.3: + %24:vgpr_32 = V_ADD_F32_e32 %14, %11, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %22, implicit %23 +... 
+--- +name: no_sink_fmac_not_constant_mode +alignment: 1 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GFX9-LABEL: name: no_sink_fmac_not_constant_mode + ; GFX9: bb.0: + ; GFX9-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: $mode = IMPLICIT_DEF + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX9-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]] + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]] + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: %9:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: %12:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec + ; GFX9-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: S_BRANCH %bb.1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.1: + ; GFX9-NEXT: successors: %bb.2(0x80000000) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %9, %10, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.2: + ; GFX9-NEXT: successors: %bb.3(0x80000000) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1 + ; GFX9-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1 + ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.3: + ; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]] + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + $mode = IMPLICIT_DEF + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = COPY $vgpr1 + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64 = S_MOV_B64 0 + %6:sreg_64 = S_MOV_B64 0 + %7:vreg_64 = COPY %5 + %8:vreg_64 = COPY %6 + %9:vgpr_32 = GLOBAL_LOAD_DWORD killed %7, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + %11:vgpr_32 = contract 
nofpexcept V_FMAC_F32_e64 0, %10, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + %12:vgpr_32 = GLOBAL_LOAD_DWORD killed %8, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec + %14:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %13, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec + %15:vgpr_32(s32) = COPY $vgpr0 + %16:sreg_32 = S_MOV_B32 1 + %17:sreg_64 = V_CMP_LT_I32_e64 %15(s32), %16, implicit $exec + %18:sreg_64 = COPY %17 + %19:sreg_64 = SI_IF %18, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %20:vgpr_32 = V_ADD_F32_e32 %10, %11, implicit $mode, implicit $exec + %21:vgpr_32 = V_ADD_F32_e32 %13, %14, implicit $mode, implicit $exec + + bb.2: + %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1 + %23:vgpr_32 = PHI %4, %bb.0, %21, %bb.1 + SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + + bb.3: + S_ENDPGM 0, implicit %22, implicit %23 +... +--- +name: test_no_sink_fmac_wwm +alignment: 1 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GFX9-LABEL: name: test_no_sink_fmac_wwm + ; GFX9: bb.0: + ; GFX9-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]] + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: %5:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: early-clobber %6:vgpr_32 = STRICT_WWM %5, implicit $exec + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY3]](s32), [[S_MOV_B32_]], implicit $exec + ; GFX9-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: S_BRANCH %bb.1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.1: + ; GFX9-NEXT: successors: %bb.2(0x80000000) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.2: + ; GFX9-NEXT: successors: %bb.3(0x80000000) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_NOP 0, implicit %5 + ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.3: + ; GFX9-NEXT: S_ENDPGM 0, implicit %6 + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = COPY $vgpr1 + + %20:sreg_64 = S_MOV_B64 0 + %30:vreg_64 = COPY %20 + %29:vgpr_32 = GLOBAL_LOAD_DWORD killed %30, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %29, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + %9:vgpr_32 = STRICT_WWM %6, implicit $exec + + %16:vgpr_32(s32) = COPY $vgpr0 + %23:sreg_32 = S_MOV_B32 1 + %24:sreg_64 = V_CMP_LT_I32_e64 %16(s32), %23, implicit $exec + %0:sreg_64 = COPY %24 + %5:sreg_64 = SI_IF %0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + + bb.2: + S_NOP 0, implicit %6 + SI_END_CF %5, implicit-def 
dead $exec, implicit-def dead $scc, implicit $exec + + bb.3: + S_ENDPGM 0, implicit %9 +... +--- +name: test_def_and_use_in_loop_sink_fmac +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GFX9-LABEL: name: test_def_and_use_in_loop_sink_fmac + ; GFX9: bb.0.entry: + ; GFX9-NEXT: successors: %bb.1(0x80000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.1: + ; GFX9-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]] + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec + ; GFX9-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: S_BRANCH %bb.2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.2: + ; GFX9-NEXT: successors: %bb.3(0x80000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.3: + ; GFX9-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: %8:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: S_NOP 0, implicit %6, implicit %8 + ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.4: + ; GFX9-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.5: + ; GFX9-NEXT: successors: %bb.6(0x80000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.6: + ; GFX9-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.7: + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_ENDPGM 0 + bb.0.entry: + successors: %bb.1(0x80000000) + + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + %101:vgpr_32 = COPY $vgpr0 + %102:vgpr_32 = COPY $vgpr1 + %15:vreg_64 = COPY $vgpr2_vgpr3 + + bb.1: + successors: %bb.2(0x40000000), 
%bb.3(0x40000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + %20:sreg_64 = S_MOV_B64 0 + %30:vreg_64 = COPY %20 + %29:vgpr_32 = GLOBAL_LOAD_DWORD %30, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %29, 0, %101, 0, %102, 0, 0, implicit $mode, implicit $exec + %31:vgpr_32 = GLOBAL_LOAD_DWORD %15, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %7:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %31, 0, %101, 0, %102, 0, 0, implicit $mode, implicit $exec + %16:vgpr_32(s32) = COPY $vgpr0 + %23:sreg_32 = S_MOV_B32 1 + %24:sreg_64 = V_CMP_LT_I32_e64 %16(s32), %23, implicit $exec + %0:sreg_64 = COPY %24 + %5:sreg_64 = SI_IF %0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + S_NOP 0 + + bb.3: + successors: %bb.4(0x40000000), %bb.6(0x40000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + S_NOP 0, implicit %6, implicit %7 + SI_END_CF %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_CBRANCH_EXECZ %bb.6, implicit $exec + + bb.4: + successors: %bb.5(0x04000000), %bb.4(0x7c000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + S_NOP 0 + S_CBRANCH_EXECZ %bb.4, implicit $exec + + bb.5: + successors: %bb.6(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + S_NOP 0 + + bb.6: + successors: %bb.7(0x04000000), %bb.1(0x7c000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + S_CBRANCH_VCCZ %bb.1, implicit $vcc + + bb.7: + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + S_ENDPGM 0 +... +--- +name: test_no_sink_def_into_loop +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GFX9-LABEL: name: test_no_sink_def_into_loop + ; GFX9: bb.0.entry: + ; GFX9-NEXT: successors: %bb.1(0x80000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]] + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: %8:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.1: + ; GFX9-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_NOP 0, implicit %6, implicit %8 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec + ; GFX9-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: S_BRANCH %bb.2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.2: + ; GFX9-NEXT: successors: 
%bb.3(0x80000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.3: + ; GFX9-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.4: + ; GFX9-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.5: + ; GFX9-NEXT: successors: %bb.6(0x80000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.6: + ; GFX9-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.7: + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_ENDPGM 0 + bb.0.entry: + successors: %bb.1(0x80000000) + + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + %101:vgpr_32 = COPY $vgpr0 + %102:vgpr_32 = COPY $vgpr1 + %15:vreg_64 = COPY $vgpr2_vgpr3 + %20:sreg_64 = S_MOV_B64 0 + %30:vreg_64 = COPY %20 + %29:vgpr_32 = GLOBAL_LOAD_DWORD killed %30, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %29, 0, %101, 0, %102, 0, 0, implicit $mode, implicit $exec + %31:vgpr_32 = GLOBAL_LOAD_DWORD killed %15, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %7:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %31, 0, %101, 0, %102, 0, 0, implicit $mode, implicit $exec + + bb.1: + successors: %bb.2(0x40000000), %bb.3(0x40000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + S_NOP 0, implicit %6, implicit %7 + %16:vgpr_32(s32) = COPY $vgpr0 + %23:sreg_32 = S_MOV_B32 1 + %24:sreg_64 = V_CMP_LT_I32_e64 %16(s32), %23, implicit $exec + %0:sreg_64 = COPY %24 + %5:sreg_64 = SI_IF %0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + S_NOP 0 + + bb.3: + successors: %bb.4(0x40000000), %bb.6(0x40000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + SI_END_CF %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_CBRANCH_EXECZ %bb.6, implicit $exec + + bb.4: + successors: %bb.5(0x04000000), %bb.4(0x7c000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + S_NOP 0 + S_CBRANCH_EXECZ %bb.4, implicit $exec + + bb.5: + successors: %bb.6(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + S_NOP 0 + + bb.6: + successors: %bb.7(0x04000000), %bb.1(0x7c000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + S_CBRANCH_VCCZ %bb.1, implicit $vcc + + bb.7: + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + S_ENDPGM 0 +... 
+--- +name: test_no_sink_def_into_loop2 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GFX9-LABEL: name: test_no_sink_def_into_loop2 + ; GFX9: bb.0.entry: + ; GFX9-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]] + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: %8:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX9-NEXT: S_BRANCH %bb.1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.1: + ; GFX9-NEXT: successors: %bb.2(0x80000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: S_BRANCH %bb.2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.2: + ; GFX9-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_NOP 0, implicit %6, implicit %8 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec + ; GFX9-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: S_BRANCH %bb.3 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.3: + ; GFX9-NEXT: successors: %bb.4(0x80000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: S_BRANCH %bb.4 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.4: + ; GFX9-NEXT: successors: %bb.5(0x40000000), %bb.7(0x40000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec + ; GFX9-NEXT: S_BRANCH %bb.5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.5: + ; GFX9-NEXT: successors: %bb.6(0x04000000), %bb.5(0x7c000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec + ; GFX9-NEXT: S_BRANCH %bb.6 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.6: + ; GFX9-NEXT: successors: %bb.7(0x80000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: S_BRANCH %bb.7 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.7: + ; GFX9-NEXT: successors: %bb.8(0x04000000), %bb.2(0x7c000000) + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_CBRANCH_VCCZ %bb.2, implicit $vcc + ; GFX9-NEXT: S_BRANCH %bb.8 + 
; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.8: + ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_ENDPGM 0 + bb.0.entry: + successors: %bb.1(0x40000000), %bb.2 (0x40000000) + + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + %101:vgpr_32 = COPY $vgpr0 + %102:vgpr_32 = COPY $vgpr1 + %15:vreg_64 = COPY $vgpr2_vgpr3 + %20:sreg_64 = S_MOV_B64 0 + %30:vreg_64 = COPY %20 + %29:vgpr_32 = GLOBAL_LOAD_DWORD killed %30, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %29, 0, %101, 0, %102, 0, 0, implicit $mode, implicit $exec + %31:vgpr_32 = GLOBAL_LOAD_DWORD killed %15, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %7:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %31, 0, %101, 0, %102, 0, 0, implicit $mode, implicit $exec + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + S_NOP 0 + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3(0x40000000), %bb.4(0x40000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + S_NOP 0, implicit %6, implicit %7 + %16:vgpr_32(s32) = COPY $vgpr0 + %23:sreg_32 = S_MOV_B32 1 + %24:sreg_64 = V_CMP_LT_I32_e64 %16(s32), %23, implicit $exec + %0:sreg_64 = COPY %24 + %5:sreg_64 = SI_IF %0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + S_NOP 0 + S_BRANCH %bb.4 + + bb.4: + successors: %bb.5(0x40000000), %bb.7(0x40000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + SI_END_CF %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_CBRANCH_EXECZ %bb.7, implicit $exec + S_BRANCH %bb.5 + + bb.5: + successors: %bb.6(0x04000000), %bb.5(0x7c000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + S_NOP 0 + S_CBRANCH_EXECZ %bb.5, implicit $exec + S_BRANCH %bb.6 + + bb.6: + successors: %bb.7(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + S_NOP 0 + S_BRANCH %bb.7 + + bb.7: + successors: %bb.8(0x04000000), %bb.2(0x7c000000) + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + + S_CBRANCH_VCCZ %bb.2, implicit $vcc + S_BRANCH %bb.8 + + bb.8: + liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc + S_ENDPGM 0 +... 
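The MIR tests above all hinge on the same operand predicate: a V_FMAC_F32_e64 whose only physical-register inputs are the implicit $mode and $exec reads may be sunk to its sole use block (test_sink_fmac_to_only_use), while a redefinition of $mode (no_sink_fmac_not_constant_mode), a whole-wave consumer (test_no_sink_fmac_wwm), or a def that would move into a loop must block the sink. As a rough illustration of the kind of check a target hook has to make, here is a minimal C++ sketch; the helper name and the register parameters are assumptions for illustration only, not the in-tree AMDGPU implementation:

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/MachineOperand.h"

    using namespace llvm;

    // Sketch only: treat an implicit read of a wave-wide control register
    // (the lane mask Exec or the FP Mode register) as ignorable for
    // sinking, since it is not a real data input of the instruction.
    static bool isIgnorableExecOrModeUse(const MachineOperand &MO,
                                         Register Exec, Register Mode) {
      if (!MO.isReg() || !MO.isUse() || !MO.isImplicit())
        return false;
      return MO.getReg() == Exec || MO.getReg() == Mode;
    }

The tests exercise exactly this distinction: the FMACs in test_sink_fmac_to_only_use carry `implicit $mode, implicit $exec` and still sink into bb.1, whereas once bb.0 contains `$mode = IMPLICIT_DEF` the same instructions stay put.
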
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -54,17 +54,17 @@ ; GFX9-LABEL: lsr_order_mul24_1: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v5, 1, v18 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-NEXT: ; %bb.1: ; %bb19 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v0 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v6 -; GFX9-NEXT: v_lshl_add_u32 v6, v4, 2, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 1, v18 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX9-NEXT: v_lshl_add_u32 v7, v4, 2, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v2 ; GFX9-NEXT: v_add_u32_e32 v9, v17, v12 ; GFX9-NEXT: s_mov_b64 s[10:11], 0 @@ -76,7 +76,7 @@ ; GFX9-NEXT: v_add_u32_e32 v12, v17, v0 ; GFX9-NEXT: v_add_u32_e32 v19, v9, v0 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_madak_f32 v3, v3, v7, 0x3727c5ac +; GFX9-NEXT: v_madak_f32 v3, v3, v6, 0x3727c5ac ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_mul_u32_u24_e32 v18, v3, v5 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v16 @@ -97,8 +97,8 @@ ; GFX9-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] -; GFX9-NEXT: ds_write_b32 v6, v3 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 +; GFX9-NEXT: ds_write_b32 v7, v3 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX9-NEXT: s_cbranch_execnz .LBB1_2 ; GFX9-NEXT: .LBB1_3: ; %Flow3 diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1373,7 +1373,6 @@ ; SI-LABEL: complex_loop: ; SI: ; %bb.0: ; %.entry ; SI-NEXT: s_cmp_lt_i32 s0, 1 -; SI-NEXT: v_mov_b32_e32 v2, -1 ; SI-NEXT: s_cbranch_scc1 .LBB15_7 ; SI-NEXT: ; %bb.1: ; %.lr.ph ; SI-NEXT: s_mov_b64 s[2:3], exec @@ -1405,7 +1404,10 @@ ; SI-NEXT: s_branch .LBB15_2 ; SI-NEXT: .LBB15_6: ; %Flow ; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: .LBB15_7: ; %._crit_edge +; SI-NEXT: exp mrt0 v2, v2, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB15_7: +; SI-NEXT: v_mov_b32_e32 v2, -1 ; SI-NEXT: exp mrt0 v2, v2, v0, v0 done vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB15_8: @@ -1415,7 +1417,6 @@ ; ; GFX10-WAVE64-LABEL: complex_loop: ; GFX10-WAVE64: ; %bb.0: ; %.entry -; GFX10-WAVE64-NEXT: v_mov_b32_e32 v2, -1 ; GFX10-WAVE64-NEXT: s_cmp_lt_i32 s0, 1 ; GFX10-WAVE64-NEXT: s_cbranch_scc1 .LBB15_7 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %.lr.ph @@ -1448,7 +1449,10 @@ ; GFX10-WAVE64-NEXT: s_branch .LBB15_2 ; GFX10-WAVE64-NEXT: .LBB15_6: ; %Flow ; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10-WAVE64-NEXT: .LBB15_7: ; %._crit_edge +; GFX10-WAVE64-NEXT: exp mrt0 v2, v2, v0, v0 done vm +; GFX10-WAVE64-NEXT: s_endpgm +; GFX10-WAVE64-NEXT: .LBB15_7: +; GFX10-WAVE64-NEXT: v_mov_b32_e32 v2, -1 ; GFX10-WAVE64-NEXT: exp mrt0 v2, v2, v0, v0 done vm ; GFX10-WAVE64-NEXT: s_endpgm ; GFX10-WAVE64-NEXT: .LBB15_8: @@ -1458,7 +1462,6 @@ ; ; GFX10-WAVE32-LABEL: complex_loop: ; 
GFX10-WAVE32: ; %bb.0: ; %.entry -; GFX10-WAVE32-NEXT: v_mov_b32_e32 v2, -1 ; GFX10-WAVE32-NEXT: s_cmp_lt_i32 s0, 1 ; GFX10-WAVE32-NEXT: s_cbranch_scc1 .LBB15_7 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %.lr.ph @@ -1491,7 +1494,10 @@ ; GFX10-WAVE32-NEXT: s_branch .LBB15_2 ; GFX10-WAVE32-NEXT: .LBB15_6: ; %Flow ; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-WAVE32-NEXT: .LBB15_7: ; %._crit_edge +; GFX10-WAVE32-NEXT: exp mrt0 v2, v2, v0, v0 done vm +; GFX10-WAVE32-NEXT: s_endpgm +; GFX10-WAVE32-NEXT: .LBB15_7: +; GFX10-WAVE32-NEXT: v_mov_b32_e32 v2, -1 ; GFX10-WAVE32-NEXT: exp mrt0 v2, v2, v0, v0 done vm ; GFX10-WAVE32-NEXT: s_endpgm ; GFX10-WAVE32-NEXT: .LBB15_8: diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -20,7 +20,6 @@ ; MUBUF-NEXT: v_mov_b32_e32 v3, 0 ; MUBUF-NEXT: v_mov_b32_e32 v4, 0x400000 ; MUBUF-NEXT: s_mov_b32 s32, 0xc0000 -; MUBUF-NEXT: v_add_nc_u32_e64 v40, 4, 0x4000 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12 @@ -33,11 +32,12 @@ ; MUBUF-NEXT: s_and_saveexec_b32 s0, vcc_lo ; MUBUF-NEXT: s_cbranch_execz .LBB0_2 ; MUBUF-NEXT: ; %bb.1: ; %if.then4.i +; MUBUF-NEXT: v_add_nc_u32_e64 v0, 4, 0x4000 ; MUBUF-NEXT: s_clause 0x1 -; MUBUF-NEXT: buffer_load_dword v0, v40, s[36:39], 0 offen -; MUBUF-NEXT: buffer_load_dword v1, v40, s[36:39], 0 offen offset:4 +; MUBUF-NEXT: buffer_load_dword v1, v0, s[36:39], 0 offen +; MUBUF-NEXT: buffer_load_dword v2, v0, s[36:39], 0 offen offset:4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; MUBUF-NEXT: v_add_nc_u32_e32 v0, v2, v1 ; MUBUF-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 ; MUBUF-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0 ; MUBUF-NEXT: buffer_store_dword v0, v0, s[36:39], 0 offen