diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -343,6 +343,21 @@ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }]) >; +def commute_constant_to_rhs : GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_ADD, G_MUL, G_AND, G_OR, G_XOR):$root, [{ + return getIConstantVRegVal(${root}->getOperand(1).getReg(), MRI).has_value(); + }]), + (apply [{ + Observer.changingInstr(*${root}); + Register LHSReg = ${root}->getOperand(1).getReg(); + Register RHSReg = ${root}->getOperand(2).getReg(); + ${root}->getOperand(1).setReg(RHSReg); + ${root}->getOperand(2).setReg(LHSReg); + Observer.changedInstr(*${root}); + }]) +>; + // Fold x op 0 -> x def right_identity_zero: GICombineRule< (defs root:$root), @@ -1086,7 +1101,7 @@ intdiv_combines, mulh_combines, redundant_neg_operands, and_or_disjoint_mask, fma_combines, fold_binop_into_select, sub_add_reg, select_to_minmax, redundant_binop_in_equality, - fsub_to_fneg]>; + fsub_to_fneg, commute_constant_to_rhs]>; // A combine group used to for prelegalizer combiners at -O0. The combines in // this group have been selected based on experiments to balance code size and diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-add-of-sub.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-add-of-sub.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-add-of-sub.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-add-of-sub.mir @@ -163,7 +163,7 @@ ; CHECK-NEXT: %x1:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: %x2:_(s32) = G_CONSTANT i32 2 ; CHECK-NEXT: %y:_(s32) = COPY $w1 - ; CHECK-NEXT: %add:_(s32) = G_ADD %x1, %y + ; CHECK-NEXT: %add:_(s32) = G_ADD %y, %x1 ; CHECK-NEXT: %sub:_(s32) = G_SUB %add, %x2 ; CHECK-NEXT: $w0 = COPY %sub(s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 @@ -351,7 +351,7 @@ ; CHECK-NEXT: %x1:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: %x2:_(s32) = G_CONSTANT i32 2 ; CHECK-NEXT: %y:_(s32) = COPY $w1 - ; CHECK-NEXT: %add:_(s32) = G_ADD %x1, %y + ; CHECK-NEXT: %add:_(s32) = G_ADD %y, %x1 ; CHECK-NEXT: %sub:_(s32) = G_SUB %x2, %add ; CHECK-NEXT: $w0 = COPY %sub(s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-immed-mismatch-crash.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-immed-mismatch-crash.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-immed-mismatch-crash.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-immed-mismatch-crash.mir @@ -13,19 +13,19 @@ ; CHECK: liveins: $x0 ; CHECK: [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF ; CHECK: [[DEF1:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 9 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9 ; CHECK: G_BRCOND [[DEF]](s1), %bb.2 ; CHECK: G_BR %bb.1 ; CHECK: bb.1: ; CHECK: successors: ; CHECK: bb.2: ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p0) :: (load (s32) from `ptr undef`, align 8) - ; CHECK: [[MUL:%[0-9]+]]:_(s32) = nsw G_MUL [[C]], [[LOAD]] - ; CHECK: [[MUL1:%[0-9]+]]:_(s32) = nsw G_MUL [[MUL]], [[C1]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = nsw G_SHL [[LOAD]], [[C1]](s32) + ; CHECK: [[MUL:%[0-9]+]]:_(s32) = nsw G_MUL [[SHL]], [[C]] ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[MUL1]], [[C2]](s64) - ; CHECK: $w0 = COPY [[SHL]](s32) + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[MUL]], [[C2]](s64) + ; CHECK: $w0 = COPY [[SHL1]](s32) ; CHECK: RET_ReallyLR implicit $w0 bb.1: liveins: $x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir @@ -46,7 +46,7 @@ ; CHECK-NEXT: %twenty:_(s32) = G_CONSTANT i32 20 ; CHECK-NEXT: %select:_(s32) = G_SELECT %cond(s1), %ten, %twenty ; CHECK-NEXT: %thirty:_(s32) = G_CONSTANT i32 30 - ; CHECK-NEXT: %add:_(s32) = G_ADD %thirty, %select + ; CHECK-NEXT: %add:_(s32) = G_ADD %select, %thirty ; CHECK-NEXT: S_ENDPGM 0, implicit %add(s32), implicit %select(s32) %reg:_(s32) = COPY $vgpr0 %zero:_(s32) = G_CONSTANT i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -14,7 +14,7 @@ ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x60001 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, -7, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -45,7 +45,7 @@ ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 -; GFX8-NEXT: v_mul_lo_u32 v1, -7, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -76,7 +76,7 @@ ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: v_mul_lo_u32 v1, -7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -107,7 +107,7 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, -7, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -141,7 +141,7 @@ ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: v_mul_lo_u32 v1, -7, v0 +; GFX11-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 @@ -183,7 +183,7 @@ ; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 6 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_lo_u32 v4, -7, v3 +; GFX6-NEXT: v_mul_lo_u32 v4, v3, -7 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -213,7 +213,7 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_lo_u32 v4, -7, v3 +; GFX8-NEXT: v_mul_lo_u32 v4, v3, -7 ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -243,7 +243,7 @@ ; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, -7, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, v3, -7 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -274,7 +274,7 @@ ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_lo_u32 v4, -7, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, v3, -7 ; GFX10-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v4 ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -308,7 +308,7 @@ ; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX11-NEXT: v_mul_lo_u32 v4, -7, v3 +; GFX11-NEXT: v_mul_lo_u32 v4, v3, -7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4 @@ -1513,7 +1513,7 @@ ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -1543,7 +1543,7 @@ ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, v1, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -1573,7 +1573,7 @@ ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -1673,7 +1673,7 @@ ; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 23 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_lo_u32 v4, v4, v3 +; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -1703,7 +1703,7 @@ ; GFX8-NEXT: v_bfe_u32 v1, v1, 1, 23 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_lo_u32 v4, v4, v3 +; GFX8-NEXT: v_mul_lo_u32 v4, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -1733,7 +1733,7 @@ ; GFX9-NEXT: v_bfe_u32 v1, v1, 1, 23 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -1854,7 +1854,7 @@ ; GFX6-NEXT: s_lshr_b32 s7, s2, 24 ; GFX6-NEXT: s_and_b32 s9, s2, 0xff ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: s_or_b32 s2, s9, s2 @@ -1892,7 +1892,7 @@ ; GFX6-NEXT: s_lshr_b32 s8, s5, 8 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX6-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX6-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX6-NEXT: s_lshl_b32 s5, s5, 8 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 @@ -1988,7 +1988,7 @@ ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: v_mul_lo_u32 v2, v1, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX8-NEXT: s_lshr_b32 s9, s3, 8 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 @@ -2023,7 +2023,7 @@ ; GFX8-NEXT: s_lshr_b32 s9, s5, 8 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: s_and_b32 s5, s5, 0xff -; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX8-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX8-NEXT: s_lshl_b32 s5, s5, s10 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 @@ -2099,7 +2099,7 @@ ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX9-NEXT: s_or_b32 s0, s0, s7 ; GFX9-NEXT: s_lshl_b32 s1, s1, s12 ; GFX9-NEXT: s_and_b32 s7, s11, 0xff @@ -2146,7 +2146,7 @@ ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX9-NEXT: s_or_b32 s4, s4, s7 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: s_lshr_b32 s11, s5, 8 @@ -2485,14 +2485,14 @@ ; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX6-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX6-NEXT: v_mul_lo_u32 v8, v6, v7 ; GFX6-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_mul_hi_u32 v6, v4, v6 ; GFX6-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v9 ; GFX6-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX6-NEXT: v_mul_lo_u32 v7, v7, v8 +; GFX6-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 @@ -2539,14 +2539,14 @@ ; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX8-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX8-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX8-NEXT: v_mul_lo_u32 v8, v6, v7 ; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; GFX8-NEXT: v_mul_hi_u32 v6, v4, v6 ; GFX8-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v9 ; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX8-NEXT: v_mul_lo_u32 v7, v7, v8 +; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v6 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 @@ -2593,10 +2593,10 @@ ; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX9-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 ; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GFX9-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX9-NEXT: v_mul_lo_u32 v8, v6, v7 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX9-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX9-NEXT: v_mul_lo_u32 v7, v7, v9 +; GFX9-NEXT: v_mul_lo_u32 v7, v9, v7 ; GFX9-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX9-NEXT: v_bfe_u32 v3, v3, 1, 23 ; GFX9-NEXT: v_mul_hi_u32 v7, v9, v7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -15,7 +15,7 @@ ; GFX6-NEXT: s_and_b32 s1, s1, 0x7f ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, -7, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -45,7 +45,7 @@ ; GFX8-NEXT: s_and_b32 s1, s1, 0x7f ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, -7, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -75,7 +75,7 @@ ; GFX9-NEXT: s_and_b32 s1, s1, 0x7f ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, -7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -105,7 +105,7 @@ ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, -7, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -138,7 +138,7 @@ ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_lo_u32 v1, -7, v0 +; GFX11-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 @@ -180,7 +180,7 @@ ; GFX6-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_lo_u32 v4, -7, v3 +; GFX6-NEXT: v_mul_lo_u32 v4, v3, -7 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -210,7 +210,7 @@ ; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_lo_u32 v4, -7, v3 +; GFX8-NEXT: v_mul_lo_u32 v4, v3, -7 ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -240,7 +240,7 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, -7, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, v3, -7 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -271,7 +271,7 @@ ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_lo_u32 v4, -7, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, v3, -7 ; GFX10-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v4 ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -305,7 +305,7 @@ ; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_lo_u32 v4, -7, v3 +; GFX11-NEXT: v_mul_lo_u32 v4, v3, -7 ; GFX11-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4 @@ -1513,7 +1513,7 @@ ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffffff -; GFX6-NEXT: v_mul_lo_u32 v1, v1, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -1544,7 +1544,7 @@ ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffffff -; GFX8-NEXT: v_mul_lo_u32 v1, v1, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -1575,7 +1575,7 @@ ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -1678,7 +1678,7 @@ ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX6-NEXT: v_mul_lo_u32 v4, v4, v3 +; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -1709,7 +1709,7 @@ ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX8-NEXT: v_mul_lo_u32 v4, v4, v3 +; GFX8-NEXT: v_mul_lo_u32 v4, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -1740,7 +1740,7 @@ ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -1854,7 +1854,7 @@ ; GFX6-NEXT: s_lshr_b32 s9, s2, 24 ; GFX6-NEXT: s_and_b32 s11, s2, 0xff ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_and_b32 s8, s8, 0xff ; GFX6-NEXT: s_or_b32 s2, s11, s2 @@ -1892,7 +1892,7 @@ ; GFX6-NEXT: s_lshr_b32 s10, s5, 8 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX6-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX6-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX6-NEXT: s_lshl_b32 s5, s5, 8 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 @@ -1989,7 +1989,7 @@ ; GFX8-NEXT: s_and_b32 s8, s9, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX8-NEXT: v_mul_lo_u32 v2, v1, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX8-NEXT: s_lshr_b32 s12, s3, 8 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 @@ -2024,7 +2024,7 @@ ; GFX8-NEXT: s_lshr_b32 s12, s5, 8 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: s_and_b32 s5, s5, 0xff -; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX8-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX8-NEXT: s_lshl_b32 s5, s5, s10 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 @@ -2099,7 +2099,7 @@ ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshr_b32 s7, s0, 8 ; GFX9-NEXT: s_lshr_b32 s10, s0, 24 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX9-NEXT: s_lshl_b32 s1, s1, s12 ; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: s_or_b32 s1, s10, s1 @@ -2145,7 +2145,7 @@ ; GFX9-NEXT: s_or_b32 s4, s4, s10 ; GFX9-NEXT: s_and_b32 s10, s11, 0xff ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s10 @@ -2496,7 +2496,7 @@ ; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX6-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX6-NEXT: v_mul_lo_u32 v8, v6, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v6, v8 @@ -2513,7 +2513,7 @@ ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v7, v8 +; GFX6-NEXT: v_mul_lo_u32 v6, v8, v7 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 23, v4 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX6-NEXT: v_mul_hi_u32 v6, v8, v6 @@ -2552,7 +2552,7 @@ ; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX8-NEXT: v_mul_lo_u32 v8, v6, v7 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8 @@ -2569,7 +2569,7 @@ ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v7, v8 +; GFX8-NEXT: v_mul_lo_u32 v6, v8, v7 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 23, v4 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX8-NEXT: v_mul_hi_u32 v6, v8, v6 @@ -2608,9 +2608,9 @@ ; GFX9-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 ; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX9-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX9-NEXT: v_mul_lo_u32 v8, v6, v7 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v5 -; GFX9-NEXT: v_mul_lo_u32 v7, v7, v9 +; GFX9-NEXT: v_mul_lo_u32 v7, v9, v7 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX9-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -678,7 +678,7 @@ ; GFX6-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_mul_lo_u32 v1, -2, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, v0, -2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100001 ; GFX6-NEXT: s_ashr_i32 s2, s0, 31 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -112,9 +112,8 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0 -; GFX10-NEXT: v_mul_lo_u32 v1, v4, v1 -; GFX10-NEXT: v_mul_lo_u32 v0, 0, v0 -; GFX10-NEXT: v_add3_u32 v3, v3, v1, v0 +; GFX10-NEXT: v_mul_lo_u32 v0, v4, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5] ; GFX10-NEXT: s_endpgm @@ -131,11 +130,9 @@ ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v4, v0, 0 -; GFX11-NEXT: v_mul_lo_u32 v1, v4, v1 -; GFX11-NEXT: v_mul_lo_u32 v0, 0, v0 +; GFX11-NEXT: v_mul_lo_u32 v0, v4, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0 ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -164,8 +161,6 @@ ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, 0 -; GFX10-NEXT: v_mul_lo_u32 v2, 0, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm @@ -176,15 +171,13 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v2, 0 -; GFX11-NEXT: v_mul_lo_u32 v2, 0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_add_nc_u32 v1, v1, v2 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -214,9 +207,8 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0 -; GFX10-NEXT: v_mul_lo_u32 v1, v4, v1 -; GFX10-NEXT: v_mul_lo_u32 v0, 0, v0 -; GFX10-NEXT: v_add3_u32 v3, v3, v1, v0 +; GFX10-NEXT: v_mul_lo_u32 v0, v4, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5] ; GFX10-NEXT: s_endpgm @@ -233,11 +225,9 @@ ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v4, v0, 0 -; GFX11-NEXT: v_mul_lo_u32 v1, v4, v1 -; GFX11-NEXT: v_mul_lo_u32 v0, 0, v0 +; GFX11-NEXT: v_mul_lo_u32 v0, v4, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0 ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-trunc-bitcast-buildvector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-trunc-bitcast-buildvector.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-trunc-bitcast-buildvector.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-trunc-bitcast-buildvector.mir @@ -13,7 +13,7 @@ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 42 - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C]], [[TRUNC]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC]], [[C]] ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ -48,7 +48,7 @@ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 42 - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C1]], [[TRUNC2]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC2]], [[C1]] ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ -78,7 +78,7 @@ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 42 - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C]], [[TRUNC]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC]], [[C]] ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ -111,7 +111,7 @@ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s64) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 42 - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C]], [[TRUNC]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC]], [[C]] ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32) %0:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -54,13 +54,9 @@ ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_lo_u32 v5, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v2, v1 ; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -130,13 +126,9 @@ ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CGP-NEXT: v_mul_lo_u32 v1, s1, v0 -; CGP-NEXT: v_mul_lo_u32 v2, 0, v1 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_lo_u32 v1, 0, v0 ; CGP-NEXT: v_mul_hi_u32 v0, s0, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CGP-NEXT: v_mul_lo_u32 v1, v0, s2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 @@ -247,20 +239,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v10, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v11, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; CGP-NEXT: v_mul_lo_u32 v10, v5, v3 @@ -302,7 +286,7 @@ ; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v3, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 ; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 @@ -328,7 +312,7 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; GISEL-NEXT: s_add_i32 s8, 0x1000, 0 +; GISEL-NEXT: s_add_i32 s8, 0, 0x1000 ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s8 @@ -394,8 +378,8 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, s4, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v7 +; CGP-NEXT: v_mul_lo_u32 v8, v3, s4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v4 ; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 ; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 @@ -443,7 +427,7 @@ ; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v3, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 ; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 @@ -469,7 +453,7 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; GISEL-NEXT: s_add_i32 s8, 0x12d8fb, 0 +; GISEL-NEXT: s_add_i32 s8, 0, 0x12d8fb ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s8 @@ -535,8 +519,8 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, s4, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v7 +; CGP-NEXT: v_mul_lo_u32 v8, v3, s4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v4 ; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 ; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 @@ -706,20 +690,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_mul_lo_u32 v6, v6, v5 ; CGP-NEXT: v_mul_lo_u32 v9, v9, v7 -; CGP-NEXT: v_mul_lo_u32 v10, 0, v6 ; CGP-NEXT: v_mul_hi_u32 v6, v5, v6 -; CGP-NEXT: v_mul_lo_u32 v11, 0, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v0, v5 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v6 ; CGP-NEXT: v_mul_hi_u32 v6, v1, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_mul_lo_u32 v7, v5, v2 ; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5 ; CGP-NEXT: v_mul_lo_u32 v10, v6, v3 @@ -797,13 +773,9 @@ ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v2, v1 ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -906,20 +878,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; CGP-NEXT: v_mul_lo_u32 v8, v5, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -50,13 +50,9 @@ ; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 -; CGP-NEXT: v_mul_lo_u32 v5, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v1 @@ -120,13 +116,9 @@ ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CGP-NEXT: v_mul_lo_u32 v1, s3, v0 -; CGP-NEXT: v_mul_lo_u32 v2, 0, v1 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_lo_u32 v1, 0, v0 ; CGP-NEXT: v_mul_hi_u32 v0, s0, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CGP-NEXT: v_mul_lo_u32 v0, v0, s1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; CGP-NEXT: v_subrev_i32_e32 v1, vcc, s1, v0 @@ -227,20 +219,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v9, v9, v8 -; CGP-NEXT: v_mul_lo_u32 v10, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v5, v7 -; CGP-NEXT: v_mul_lo_u32 v11, 0, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v0, v5 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v2 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 @@ -278,7 +262,7 @@ ; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v3, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 ; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 @@ -302,7 +286,7 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; GISEL-NEXT: s_add_i32 s4, 0x1000, 0 +; GISEL-NEXT: s_add_i32 s4, 0, 0x1000 ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s4 @@ -364,8 +348,8 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, s5, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v7 +; CGP-NEXT: v_mul_lo_u32 v8, v3, s5 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v4 ; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 ; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 @@ -409,7 +393,7 @@ ; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v3, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 ; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 @@ -433,7 +417,7 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; GISEL-NEXT: s_add_i32 s4, 0x12d8fb, 0 +; GISEL-NEXT: s_add_i32 s4, 0, 0x12d8fb ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s4 @@ -495,8 +479,8 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v8, s5, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v7 +; CGP-NEXT: v_mul_lo_u32 v8, v3, s5 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v4 ; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 ; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 @@ -651,20 +635,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 ; CGP-NEXT: v_mul_lo_u32 v9, v9, v8 -; CGP-NEXT: v_mul_lo_u32 v10, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v11, 0, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v6 ; CGP-NEXT: v_mul_hi_u32 v6, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_mul_lo_u32 v6, v6, v2 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 @@ -735,13 +711,9 @@ ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 @@ -836,20 +808,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll @@ -38,13 +38,9 @@ ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v2, v1 ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -96,13 +92,9 @@ ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CGP-NEXT: v_mul_lo_u32 v1, s2, v0 -; CGP-NEXT: v_mul_lo_u32 v2, 0, v1 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_lo_u32 v1, 0, v0 ; CGP-NEXT: v_mul_hi_u32 v0, s0, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CGP-NEXT: v_mul_lo_u32 v1, v0, s1 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 @@ -179,20 +171,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; CGP-NEXT: v_mul_lo_u32 v8, v5, v3 @@ -367,20 +351,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; CGP-NEXT: v_mul_lo_u32 v8, v5, v3 @@ -445,13 +421,9 @@ ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v2, v1 ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -536,20 +508,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; CGP-NEXT: v_mul_lo_u32 v8, v5, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -36,13 +36,9 @@ ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 @@ -90,13 +86,9 @@ ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CGP-NEXT: v_mul_lo_u32 v1, s2, v0 -; CGP-NEXT: v_mul_lo_u32 v2, 0, v1 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_lo_u32 v1, 0, v0 ; CGP-NEXT: v_mul_hi_u32 v0, s0, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CGP-NEXT: v_mul_lo_u32 v0, v0, s1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; CGP-NEXT: v_subrev_i32_e32 v1, vcc, s1, v0 @@ -167,20 +159,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 @@ -232,7 +216,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v2, 0xffed2705 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1 +; CHECK-NEXT: v_mul_lo_u32 v2, v1, v2 ; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -260,7 +244,7 @@ ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v3 +; GISEL-NEXT: v_mul_lo_u32 v4, v3, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v0, v3 @@ -291,7 +275,7 @@ ; CGP-NEXT: s_mov_b32 s5, 0xffed2705 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, s5, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v2, s5 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v0, v2 @@ -404,20 +388,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 @@ -476,13 +452,9 @@ ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 @@ -561,20 +533,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -985,58 +985,56 @@ ; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v6 ; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, s5, v6 -; CHECK-NEXT: v_mul_lo_u32 v8, s5, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, -1, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, s5, v3 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_mul_lo_u32 v7, v6, s5 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, s5 +; CHECK-NEXT: v_mul_hi_u32 v9, s5, v3 +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CHECK-NEXT: v_mul_lo_u32 v9, v6, v8 -; CHECK-NEXT: v_mul_hi_u32 v11, v3, v8 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v8 ; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v10, v3, v7 +; CHECK-NEXT: v_mul_lo_u32 v11, v3, v7 ; CHECK-NEXT: v_mul_lo_u32 v12, v6, v7 ; CHECK-NEXT: v_mul_hi_u32 v13, v3, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v6, v7 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, s5, v3 -; CHECK-NEXT: v_mul_lo_u32 v8, -1, v3 -; CHECK-NEXT: v_mul_hi_u32 v9, s5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, s5, v6 -; CHECK-NEXT: v_mul_lo_u32 v11, v6, v7 -; CHECK-NEXT: v_mul_hi_u32 v12, v3, v7 +; CHECK-NEXT: v_mul_lo_u32 v7, v3, s5 +; CHECK-NEXT: v_mul_hi_u32 v8, s5, v3 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, s5 +; CHECK-NEXT: v_mul_lo_u32 v10, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v11, v3, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v6, v7 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CHECK-NEXT: v_sub_i32_e32 v9, vcc, v9, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CHECK-NEXT: v_mul_lo_u32 v9, v3, v8 -; CHECK-NEXT: v_mul_lo_u32 v10, v6, v8 +; CHECK-NEXT: v_mul_lo_u32 v12, v6, v8 ; CHECK-NEXT: v_mul_hi_u32 v13, v3, v8 ; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v11 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -1063,12 +1061,10 @@ ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, s4, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, 0, v3 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, s4 ; CHECK-NEXT: v_mul_hi_u32 v3, s4, v3 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_mul_lo_u32 v6, s4, v6 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CHECK-NEXT: v_mul_lo_u32 v6, v6, s4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 ; CHECK-NEXT: v_subb_u32_e64 v6, s[4:5], v1, v3, vcc @@ -1288,19 +1284,15 @@ ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_lo_u32 v14, s8, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, 0, v8 +; GISEL-NEXT: v_mul_lo_u32 v14, v8, s8 ; GISEL-NEXT: v_mul_hi_u32 v8, s8, v8 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_lo_u32 v12, s8, v7 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, s8 ; GISEL-NEXT: v_mul_hi_u32 v7, s8, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_mul_lo_u32 v9, s8, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, s8, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, s8 +; GISEL-NEXT: v_mul_lo_u32 v10, v10, s8 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 @@ -1380,121 +1372,117 @@ ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v10 ; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v8, s6 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 -; CGP-NEXT: v_mul_lo_u32 v13, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v14, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v15, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v16, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v17, -1, v7 -; CGP-NEXT: v_mul_hi_u32 v18, s6, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v10, s6 +; CGP-NEXT: v_mul_lo_u32 v13, v5, s6 +; CGP-NEXT: v_mul_hi_u32 v14, s6, v5 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v5 +; CGP-NEXT: v_mul_lo_u32 v15, v7, s6 +; CGP-NEXT: v_mul_hi_u32 v16, s6, v7 +; CGP-NEXT: v_sub_i32_e32 v12, vcc, v12, v7 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 ; CGP-NEXT: v_mul_lo_u32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v19, v5, v13 +; CGP-NEXT: v_mul_hi_u32 v17, v5, v13 ; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_mul_lo_u32 v17, v10, v16 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v7, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v10, v16 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; CGP-NEXT: v_mul_lo_u32 v18, v7, v12 -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; CGP-NEXT: v_mul_lo_u32 v15, v5, v11 -; CGP-NEXT: v_mul_lo_u32 v17, v8, v11 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; CGP-NEXT: v_mul_lo_u32 v16, v10, v15 +; CGP-NEXT: v_mul_hi_u32 v18, v7, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v10, v15 +; CGP-NEXT: v_mul_lo_u32 v19, v7, v12 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v19 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v18 +; CGP-NEXT: v_mul_lo_u32 v16, v5, v11 +; CGP-NEXT: v_mul_lo_u32 v18, v8, v11 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 ; CGP-NEXT: v_mul_hi_u32 v14, v5, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19 -; CGP-NEXT: v_mul_lo_u32 v19, v10, v12 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v16, v17 +; CGP-NEXT: v_mul_lo_u32 v17, v10, v12 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_mul_hi_u32 v18, v7, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v19, v16 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v18 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17 +; CGP-NEXT: v_mul_hi_u32 v19, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v18, v17 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v19 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v18 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v11, vcc -; CGP-NEXT: v_mul_lo_u32 v11, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v13, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v16 +; CGP-NEXT: v_mul_lo_u32 v11, v5, s6 +; CGP-NEXT: v_mul_hi_u32 v13, s6, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc -; CGP-NEXT: v_mul_lo_u32 v12, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v15, -1, v7 -; CGP-NEXT: v_mul_hi_u32 v16, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v17, s6, v8 -; CGP-NEXT: v_mul_lo_u32 v18, v8, v11 -; CGP-NEXT: v_mul_hi_u32 v19, v5, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v7, s6 +; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v15, v8, s6 +; CGP-NEXT: v_mul_lo_u32 v16, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v17, v5, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; CGP-NEXT: v_mul_lo_u32 v17, s6, v10 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 -; CGP-NEXT: v_mul_lo_u32 v17, v10, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v18, v10, s6 +; CGP-NEXT: v_mul_lo_u32 v19, v10, v12 +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v5 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_mul_hi_u32 v15, v7, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; CGP-NEXT: v_mul_lo_u32 v16, v7, v15 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_lo_u32 v14, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v16, v8, v13 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 -; CGP-NEXT: v_mul_hi_u32 v14, v5, v13 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v18, s[4:5], v18, v19 -; CGP-NEXT: v_mul_lo_u32 v19, v10, v15 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_mul_hi_u32 v17, v7, v15 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v19, v12 +; CGP-NEXT: v_sub_i32_e32 v18, vcc, v18, v7 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v18, v14 +; CGP-NEXT: v_mul_lo_u32 v18, v7, v14 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18 ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v18, v15 +; CGP-NEXT: v_mul_lo_u32 v15, v5, v13 +; CGP-NEXT: v_mul_lo_u32 v18, v8, v13 +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v13 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v16, v17 +; CGP-NEXT: v_mul_lo_u32 v17, v10, v14 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v18, v11 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v18, v15 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; CGP-NEXT: v_mul_hi_u32 v19, v7, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v19 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v19 ; CGP-NEXT: v_mov_b32_e32 v19, s7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v18 -; CGP-NEXT: v_mov_b32_e32 v18, s9 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v15 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v16 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; CGP-NEXT: v_mov_b32_e32 v16, s9 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc ; CGP-NEXT: v_mul_lo_u32 v11, v1, v5 @@ -1506,8 +1494,8 @@ ; CGP-NEXT: v_mul_hi_u32 v14, v2, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 ; CGP-NEXT: v_mul_lo_u32 v15, v0, v8 -; CGP-NEXT: v_mul_lo_u32 v16, v1, v8 -; CGP-NEXT: v_mul_hi_u32 v17, v0, v8 +; CGP-NEXT: v_mul_lo_u32 v17, v1, v8 +; CGP-NEXT: v_mul_hi_u32 v18, v0, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc @@ -1519,38 +1507,34 @@ ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; CGP-NEXT: v_mul_hi_u32 v11, v2, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v3, v10 -; CGP-NEXT: v_add_i32_e64 v5, s[6:7], v16, v5 +; CGP-NEXT: v_add_i32_e64 v5, s[6:7], v17, v5 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] ; CGP-NEXT: v_add_i32_e64 v7, s[6:7], v13, v7 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v18 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v18 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_lo_u32 v14, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v15, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v14, v5, s8 ; CGP-NEXT: v_mul_hi_u32 v5, s8, v5 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v12, s8, v7 -; CGP-NEXT: v_mul_lo_u32 v16, 0, v7 +; CGP-NEXT: v_mul_lo_u32 v12, v7, s8 ; CGP-NEXT: v_mul_hi_u32 v7, s8, v7 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_mul_lo_u32 v8, s8, v8 -; CGP-NEXT: v_mul_lo_u32 v10, s8, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v8, v8, s8 +; CGP-NEXT: v_mul_lo_u32 v10, v10, s8 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 @@ -1582,7 +1566,7 @@ ; CGP-NEXT: v_sub_i32_e32 v11, vcc, v7, v4 ; CGP-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v4 ; CGP-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9