Index: llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -294,6 +294,10 @@ bool matchCombineFAbsOfFAbs(MachineInstr &MI, Register &Src); bool applyCombineFAbsOfFAbs(MachineInstr &MI, Register &Src); + /// Transform G_MUL(x, -1) to G_SUB(0, x); the matcher stores x in \p Reg. + bool matchCombineMulByNegativeOne(MachineInstr &MI, Register &Reg); + bool applyCombineMulByNegativeOne(MachineInstr &MI, Register &Reg); + /// Return true if any explicit use operand on \p MI is defined by a /// G_IMPLICIT_DEF. bool matchAnyExplicitUseIsUndef(MachineInstr &MI); Index: llvm/include/llvm/Target/GlobalISel/Combine.td =================================================================== --- llvm/include/llvm/Target/GlobalISel/Combine.td +++ llvm/include/llvm/Target/GlobalISel/Combine.td @@ -429,6 +429,23 @@ (apply [{ return Helper.applyCombineUnmergeWithDeadLanesToTrunc(*${d}); }]) >; +// Transform (mul x, 1) -> x +def mul_by_one: GICombineRule < + (defs root:$root), + (match (wip_match_opcode G_MUL):$root, + [{ return Helper.matchConstantOp(${root}->getOperand(2), 1); }]), + (apply [{ return Helper.replaceSingleDefInstWithReg(*${root}, 1); }]) +>; + +// Transform (mul x, -1) -> (sub 0, x) +def mul_by_neg_one_matchinfo : GIDefMatchData<"Register">; +def mul_by_neg_one: GICombineRule < + (defs root:$root, mul_by_neg_one_matchinfo:$matchinfo), + (match (wip_match_opcode G_MUL):$root, + [{ return Helper.matchCombineMulByNegativeOne(*${root}, ${matchinfo}); }]), + (apply [{ return Helper.applyCombineMulByNegativeOne(*${root}, ${matchinfo}); }]) +>; + // FIXME: These should use the custom predicate feature once it lands. 
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, undef_to_negative_one,
@@ -442,7 +459,7 @@
                                         binop_same_val, binop_left_to_zero,
                                         binop_right_to_zero, p2i_to_i2p,
                                         i2p_to_p2i, anyext_trunc_fold,
-                                        fneg_fneg_fold]>;
+                                        fneg_fneg_fold, mul_by_one]>;
 
 def known_bits_simplifications : GICombineGroup<[
   and_trivial_mask, redundant_sext_inreg]>;
@@ -451,7 +468,9 @@
 
 def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp]>;
 
-def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd]>;
+def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd,
+  mul_by_neg_one]>;
+
 def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain,
     combines_for_extload, combine_indexed_load_store, undef_combines,
     identity_combines, simplify_add_to_sub,
Index: llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1947,6 +1947,35 @@
   return false;
 }
 
+/// Match G_MUL(x, -1) where -1 is an integer constant (m_ICst); on success
+/// \p Reg is set to x.
+bool CombinerHelper::matchCombineMulByNegativeOne(MachineInstr &MI,
+                                                  Register &Reg) {
+  assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
+  Register DstReg = MI.getOperand(0).getReg();
+  int64_t Cst;
+  // Cst is only read after mi_match succeeded (short-circuit &&).
+  return mi_match(DstReg, MRI, m_GMul(m_Reg(Reg), m_ICst(Cst))) && Cst == -1;
+}
+
+/// Replace G_MUL(x, -1) with G_SUB(0, x), where \p Reg is x. Erases \p MI.
+bool CombinerHelper::applyCombineMulByNegativeOne(MachineInstr &MI,
+                                                  Register &Reg) {
+  assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
+  Register DstReg = MI.getOperand(0).getReg();
+  LLT DstTy = MRI.getType(DstReg);
+
+  // nuw must not be carried over: for x == 1, "mul nuw x, -1" does not wrap
+  // but "sub nuw 0, x" does. nsw stays valid (both wrap only for INT_MIN).
+  const unsigned Flags =
+      MI.getFlags() & ~static_cast<unsigned>(MachineInstr::NoUWrap);
+
+  Builder.setInstrAndDebugLoc(MI);
+  Builder.buildSub(DstReg, Builder.buildConstant(DstTy, 0), Reg, Flags);
+  MI.eraseFromParent();
+  return true;
+}
+
 bool CombinerHelper::matchCombineFNegOfFNeg(MachineInstr &MI, Register &Reg) {
   assert(MI.getOpcode() == TargetOpcode::G_FNEG && "Expected a G_FNEG");
   Register SrcReg = MI.getOperand(1).getReg();
Index: 
llvm/test/CodeGen/AArch64/GlobalISel/combine-mul.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/GlobalISel/combine-mul.mir @@ -0,0 +1,134 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s + +--- +name: mul_by_zero +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: mul_by_zero + ; CHECK: liveins: $x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: $x0 = COPY [[C]](s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 0 + %2:_(s64) = G_MUL %0, %1(s64) + $x0 = COPY %2(s64) +... +--- +name: mul_vector_by_zero +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $q0 + ; Currently not implemented. + ; CHECK-LABEL: name: mul_vector_by_zero + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) + ; CHECK: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[COPY]], [[BUILD_VECTOR]] + ; CHECK: $q0 = COPY [[MUL]](<4 x s32>) + %0:_(<4 x s32>) = COPY $q0 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(<4 x s32>) = G_BUILD_VECTOR %1(s32), %1(s32), %1(s32), %1(s32) + %3:_(<4 x s32>) = G_MUL %0, %2(<4 x s32>) + $q0 = COPY %3(<4 x s32>) +... 
+--- +name: mul_by_one +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: mul_by_one + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: $x0 = COPY [[COPY]](s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 1 + %2:_(s64) = G_MUL %0, %1(s64) + $x0 = COPY %2(s64) +... +--- +name: mul_vector_by_one +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $q0 + ; Currently not implemented. + ; CHECK-LABEL: name: mul_vector_by_one + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) + ; CHECK: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[COPY]], [[BUILD_VECTOR]] + ; CHECK: $q0 = COPY [[MUL]](<4 x s32>) + %0:_(<4 x s32>) = COPY $q0 + %1:_(s32) = G_CONSTANT i32 1 + %2:_(<4 x s32>) = G_BUILD_VECTOR %1(s32), %1(s32), %1(s32), %1(s32) + %3:_(<4 x s32>) = G_MUL %0, %2(<4 x s32>) + $q0 = COPY %3(<4 x s32>) +... +--- +name: mul_by_neg_one +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: mul_by_neg_one + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C]], [[COPY]] + ; CHECK: $x0 = COPY [[SUB]](s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 -1 + %2:_(s64) = G_MUL %0, %1(s64) + $x0 = COPY %2(s64) +... +--- +name: mul_vector_by_neg_one +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $q0 + ; Currently not implemented. 
+ ; CHECK-LABEL: name: mul_vector_by_neg_one + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) + ; CHECK: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[COPY]], [[BUILD_VECTOR]] + ; CHECK: $q0 = COPY [[MUL]](<4 x s32>) + %0:_(<4 x s32>) = COPY $q0 + %1:_(s32) = G_CONSTANT i32 -1 + %2:_(<4 x s32>) = G_BUILD_VECTOR %1(s32), %1(s32), %1(s32), %1(s32) + %3:_(<4 x s32>) = G_MUL %0, %2(<4 x s32>) + $q0 = COPY %3(<4 x s32>) +... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -1070,59 +1070,57 @@ ; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 +; CHECK-NEXT: v_mul_lo_u32 v5, s6, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s6, v2 +; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v5, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_mul_lo_u32 v7, v4, v6 ; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v4, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 
0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v6 +; CHECK-NEXT: v_mul_hi_u32 v9, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, s6, v2 ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 +; CHECK-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v2 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v8 ; CHECK-NEXT: v_mul_lo_u32 v10, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v2, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 +; CHECK-NEXT: v_mul_hi_u32 v5, v2, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8 ; CHECK-NEXT: s_movk_i32 s6, 0x1000 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, 
s[4:5] -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 +; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v9, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] @@ -1516,62 +1514,60 @@ ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 ; CGP-NEXT: s_movk_i32 s7, 0x1000 -; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v11, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v7 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_mul_lo_u32 v11, v8, v10 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v11, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v13, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v10 +; CGP-NEXT: v_mul_hi_u32 v13, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v12, s6, v7 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v13 +; CGP-NEXT: v_sub_i32_e64 v11, s[4:5], v11, v7 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v12 ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, 
s[4:5] -; CGP-NEXT: v_mul_lo_u32 v12, v10, v11 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 ; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] @@ -1641,62 +1637,60 @@ ; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v7, v7 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_mul_lo_u32 v8, -1, v4 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v8, s6, v7 +; CGP-NEXT: v_mul_hi_u32 v10, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v4 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_mul_lo_u32 v9, v7, v10 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v7, v9 ; CGP-NEXT: v_mul_lo_u32 v11, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: 
v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_mul_hi_u32 v11, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, -1, v4 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v9 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v4 ; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_mul_lo_u32 v11, v9, v12 +; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v4 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v11 ; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 +; CGP-NEXT: v_mul_hi_u32 v8, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 ; CGP-NEXT: v_mul_hi_u32 v13, v4, v10 ; 
CGP-NEXT: v_mul_hi_u32 v9, v9, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] @@ -1793,59 +1787,57 @@ ; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 +; CHECK-NEXT: v_mul_lo_u32 v5, s6, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s6, v2 +; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v5, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_mul_lo_u32 v7, v4, v6 ; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v4, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: 
v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v6 +; CHECK-NEXT: v_mul_hi_u32 v9, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, s6, v2 ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 +; CHECK-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v2 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v8 ; CHECK-NEXT: v_mul_lo_u32 v10, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v2, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 +; CHECK-NEXT: v_mul_hi_u32 v5, v2, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8 ; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 +; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v9, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 
; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] @@ -2239,62 +2231,60 @@ ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 ; CGP-NEXT: s_mov_b32 s7, 0x12d8fb -; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v11, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v7 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_mul_lo_u32 v11, v8, v10 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: 
v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v11, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v13, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v10 +; CGP-NEXT: v_mul_hi_u32 v13, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v12, s6, v7 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v13 +; CGP-NEXT: v_sub_i32_e64 v11, s[4:5], v11, v7 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v12 ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v12, v10, v11 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 ; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] @@ -2364,62 +2354,60 @@ ; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 ; CGP-NEXT: 
v_trunc_f32_e32 v7, v7 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_mul_lo_u32 v8, -1, v4 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v8, s6, v7 +; CGP-NEXT: v_mul_hi_u32 v10, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v4 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_mul_lo_u32 v9, v7, v10 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v7, v9 ; CGP-NEXT: v_mul_lo_u32 v11, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_mul_hi_u32 v11, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: 
v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, -1, v4 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v9 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v4 ; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_mul_lo_u32 v11, v9, v12 +; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v4 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v11 ; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 +; CGP-NEXT: v_mul_hi_u32 v8, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 ; CGP-NEXT: v_mul_hi_u32 v13, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -1050,59 
+1050,57 @@ ; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 +; CHECK-NEXT: v_mul_lo_u32 v5, s6, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s6, v2 +; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v5, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_mul_lo_u32 v7, v4, v6 ; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v4, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, 
vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v6 +; CHECK-NEXT: v_mul_hi_u32 v9, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, s6, v2 ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 +; CHECK-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v2 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v8 ; CHECK-NEXT: v_mul_lo_u32 v10, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v2, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 +; CHECK-NEXT: v_mul_hi_u32 v5, v2, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8 ; CHECK-NEXT: s_movk_i32 s6, 0x1000 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 +; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v9, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] @@ -1492,62 +1490,60 @@ ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: 
v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 ; CGP-NEXT: s_movk_i32 s7, 0x1000 -; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v11, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v7 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_mul_lo_u32 v11, v8, v10 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 
; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v11, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v13, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v10 +; CGP-NEXT: v_mul_hi_u32 v13, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v12, s6, v7 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v13 +; CGP-NEXT: v_sub_i32_e64 v11, s[4:5], v11, v7 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v12 ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v12, v10, v11 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 ; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] @@ -1614,63 +1610,61 @@ ; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v7, v7 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, -1, 
v4 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v8, s6, v7 +; CGP-NEXT: v_mul_hi_u32 v10, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v4 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_mul_lo_u32 v9, v7, v10 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v7, v9 ; CGP-NEXT: v_mul_lo_u32 v11, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_mul_hi_u32 v11, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc -; CGP-NEXT: 
v_mul_lo_u32 v10, -1, v4 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v9 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v4 ; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_mul_lo_u32 v11, v9, v12 +; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v4 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v11 ; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 +; CGP-NEXT: v_mul_hi_u32 v8, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 ; CGP-NEXT: v_mul_hi_u32 v13, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] @@ -1765,59 +1759,57 @@ ; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 -; 
CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 +; CHECK-NEXT: v_mul_lo_u32 v5, s6, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s6, v2 +; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v5, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_mul_lo_u32 v7, v4, v6 ; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v4, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v6 +; CHECK-NEXT: v_mul_hi_u32 v9, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, s6, 
v2 ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 +; CHECK-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v2 +; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v8 ; CHECK-NEXT: v_mul_lo_u32 v10, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v2, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 +; CHECK-NEXT: v_mul_hi_u32 v5, v2, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8 ; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 +; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 +; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v9, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 +; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] @@ -2207,62 +2199,60 @@ ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 ; CGP-NEXT: s_mov_b32 s7, 0x12d8fb -; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v11, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v7 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: 
v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_mul_lo_u32 v11, v8, v10 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v11, -1, v7 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v13, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v10 +; CGP-NEXT: v_mul_hi_u32 v13, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v12, s6, v7 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, 
v14 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v13 +; CGP-NEXT: v_sub_i32_e64 v11, s[4:5], v11, v7 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v12 ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v12, v10, v11 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 ; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] @@ -2329,63 +2319,61 @@ ; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v7, v7 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, -1, v4 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v8, s6, v7 +; CGP-NEXT: v_mul_hi_u32 v10, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v4 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; 
CGP-NEXT: v_mul_lo_u32 v9, v7, v10 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v7, v9 ; CGP-NEXT: v_mul_lo_u32 v11, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_mul_hi_u32 v11, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, -1, v4 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v9 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v4 ; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_mul_lo_u32 v11, v9, v12 +; CGP-NEXT: 
v_sub_i32_e64 v10, s[4:5], v10, v4 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v11 ; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 +; CGP-NEXT: v_mul_hi_u32 v8, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 ; CGP-NEXT: v_mul_hi_u32 v13, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -977,27 +977,26 @@ ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_lo_u32 v4, s6, v3 ; CHECK-NEXT: v_mul_lo_u32 v5, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, s6, v2 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_mul_lo_u32 v6, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v3, v5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v7, v2, v4 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v4 ; CHECK-NEXT: v_mul_lo_u32 
v9, v3, v4 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 @@ -1006,23 +1005,22 @@ ; CHECK-NEXT: v_addc_u32_e64 v5, s[4:5], v3, v4, vcc ; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v3, v4 ; CHECK-NEXT: v_mul_lo_u32 v4, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v10, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v4 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v5, v4 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 +; CHECK-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v2 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 ; CHECK-NEXT: v_mul_lo_u32 v7, v2, v6 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v6 +; CHECK-NEXT: v_mul_lo_u32 v10, v5, v6 ; CHECK-NEXT: v_mul_hi_u32 v11, v2, v6 ; CHECK-NEXT: v_mul_hi_u32 v5, v5, v6 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v9, v7 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v8, v7 ; CHECK-NEXT: 
v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v8, v4 +; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v10, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] @@ -1388,117 +1386,113 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_mul_lo_u32 v9, s8, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v11, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v12, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v13, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v14, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v15, s8, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, s8, v4 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 +; CGP-NEXT: v_mul_lo_u32 v12, s8, v5 +; CGP-NEXT: v_mul_hi_u32 v13, s8, v5 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v5 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v11, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v16, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v6, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v16, v4, v8 +; CGP-NEXT: v_mul_lo_u32 v17, v6, v8 ; CGP-NEXT: v_mul_hi_u32 v18, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 ; CGP-NEXT: v_mul_lo_u32 v19, v5, v9 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v19 ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, 
vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_mul_lo_u32 v13, v7, v9 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v17, v10 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v18 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v16 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v19, v18 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v16, v11 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v19, v18 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: 
v_add_i32_e32 v12, vcc, v14, v15 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v8, vcc ; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 ; CGP-NEXT: v_mul_lo_u32 v8, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v11, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v12, s8, v4 -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 -; CGP-NEXT: v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5] +; CGP-NEXT: v_mul_hi_u32 v11, s8, v4 +; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v12 +; CGP-NEXT: v_addc_u32_e64 v12, s[6:7], v7, v9, s[4:5] ; CGP-NEXT: v_add_i32_e64 v7, s[6:7], v7, v9 ; CGP-NEXT: v_mul_lo_u32 v9, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v14, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v15, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v16, s8, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v10, v8 -; CGP-NEXT: v_mul_hi_u32 v18, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v13, s8, v5 +; CGP-NEXT: v_mul_lo_u32 v14, s8, v10 +; CGP-NEXT: v_mul_lo_u32 v15, v10, v8 +; CGP-NEXT: v_mul_hi_u32 v16, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v10, v8 -; CGP-NEXT: v_mul_lo_u32 v19, s8, v13 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v16 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v9 -; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v14, v19 +; CGP-NEXT: v_mul_lo_u32 v17, s8, v12 +; CGP-NEXT: v_mul_lo_u32 v18, v12, v9 ; CGP-NEXT: v_mul_hi_u32 v19, v5, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v14, v15 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v9 +; CGP-NEXT: v_sub_i32_e64 v14, s[6:7], v14, v4 +; CGP-NEXT: v_sub_i32_e64 v17, s[6:7], v17, v5 +; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v14, v11 +; CGP-NEXT: v_add_i32_e64 v13, s[6:7], v17, v13 ; CGP-NEXT: v_mul_lo_u32 v14, v4, v11 -; CGP-NEXT: v_mul_lo_u32 v15, v5, v12 -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v16, 
v15 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v15, v19 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v11 +; CGP-NEXT: v_mul_lo_u32 v17, v5, v13 +; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v18, v17 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] +; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v17, v19 +; CGP-NEXT: v_mul_lo_u32 v17, v10, v11 ; CGP-NEXT: v_mul_hi_u32 v19, v4, v11 ; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: v_mul_lo_u32 v11, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v13, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v5, v12 -; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v17, v14 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v8, s[8:9], v15, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v13 +; CGP-NEXT: v_mul_hi_u32 v12, v12, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v13 +; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v15, v14 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[8:9] +; CGP-NEXT: v_add_i32_e64 v8, s[8:9], v17, v8 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] ; CGP-NEXT: v_add_i32_e64 v9, s[8:9], v11, v9 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v14, v18 +; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v14, v16 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[8:9] ; CGP-NEXT: v_add_i32_e64 v8, s[8:9], v8, v19 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[8:9] ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v17, v14 -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v15, v18 -; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v16, v19 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 +; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v15, v14 +; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v17, v16 +; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v18, v19 +; 
CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v13 ; CGP-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] ; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v16 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v15, v12 +; CGP-NEXT: v_add_i32_e64 v13, s[6:7], v15, v13 ; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v14 -; CGP-NEXT: v_add_i32_e64 v10, s[6:7], v10, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v13, v11 +; CGP-NEXT: v_add_i32_e64 v10, s[6:7], v10, v13 +; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v12, v11 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc ; CGP-NEXT: v_addc_u32_e64 v7, vcc, v7, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 @@ -1638,27 +1632,26 @@ ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_lo_u32 v4, s6, v3 ; CHECK-NEXT: v_mul_lo_u32 v5, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, s6, v2 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_mul_lo_u32 v6, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v3, v5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v7, v2, v4 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v4 ; CHECK-NEXT: v_mul_lo_u32 v9, v3, v4 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, 
v5, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 @@ -1667,23 +1660,22 @@ ; CHECK-NEXT: v_addc_u32_e64 v5, s[4:5], v3, v4, vcc ; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v3, v4 ; CHECK-NEXT: v_mul_lo_u32 v4, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v10, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v4 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v5, v4 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 +; CHECK-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v2 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 ; CHECK-NEXT: v_mul_lo_u32 v7, v2, v6 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v6 +; CHECK-NEXT: v_mul_lo_u32 v10, v5, v6 ; CHECK-NEXT: v_mul_hi_u32 v11, v2, v6 ; CHECK-NEXT: v_mul_hi_u32 v5, v5, v6 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v9, v7 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v8, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v8, v4 +; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v10, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] @@ -2049,117 +2041,113 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_mul_lo_u32 v9, s8, v7 ; CGP-NEXT: 
v_mul_lo_u32 v10, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v11, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v12, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v13, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v14, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v15, s8, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, s8, v4 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 +; CGP-NEXT: v_mul_lo_u32 v12, s8, v5 +; CGP-NEXT: v_mul_hi_u32 v13, s8, v5 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v5 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v11, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v16, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v6, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v16, v4, v8 +; CGP-NEXT: v_mul_lo_u32 v17, v6, v8 ; CGP-NEXT: v_mul_hi_u32 v18, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 ; CGP-NEXT: v_mul_lo_u32 v19, v5, v9 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v19 ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_mul_lo_u32 v13, v7, v9 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; 
CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v17, v10 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v18 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v16 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v19, v18 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v16, v11 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v19, v18 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v15 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v8, vcc ; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 ; CGP-NEXT: v_mul_lo_u32 v8, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v11, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v12, s8, v4 -; CGP-NEXT: v_add_i32_e64 
v5, s[4:5], v5, v13 -; CGP-NEXT: v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5] +; CGP-NEXT: v_mul_hi_u32 v11, s8, v4 +; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v12 +; CGP-NEXT: v_addc_u32_e64 v12, s[6:7], v7, v9, s[4:5] ; CGP-NEXT: v_add_i32_e64 v7, s[6:7], v7, v9 ; CGP-NEXT: v_mul_lo_u32 v9, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v14, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v15, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v16, s8, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v10, v8 -; CGP-NEXT: v_mul_hi_u32 v18, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v13, s8, v5 +; CGP-NEXT: v_mul_lo_u32 v14, s8, v10 +; CGP-NEXT: v_mul_lo_u32 v15, v10, v8 +; CGP-NEXT: v_mul_hi_u32 v16, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v10, v8 -; CGP-NEXT: v_mul_lo_u32 v19, s8, v13 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v16 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v9 -; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v14, v19 +; CGP-NEXT: v_mul_lo_u32 v17, s8, v12 +; CGP-NEXT: v_mul_lo_u32 v18, v12, v9 ; CGP-NEXT: v_mul_hi_u32 v19, v5, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v14, v15 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v9 +; CGP-NEXT: v_sub_i32_e64 v14, s[6:7], v14, v4 +; CGP-NEXT: v_sub_i32_e64 v17, s[6:7], v17, v5 +; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v14, v11 +; CGP-NEXT: v_add_i32_e64 v13, s[6:7], v17, v13 ; CGP-NEXT: v_mul_lo_u32 v14, v4, v11 -; CGP-NEXT: v_mul_lo_u32 v15, v5, v12 -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v15, v19 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v11 +; CGP-NEXT: v_mul_lo_u32 v17, v5, v13 +; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v18, v17 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] +; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v17, v19 +; CGP-NEXT: v_mul_lo_u32 v17, v10, v11 ; CGP-NEXT: v_mul_hi_u32 v19, v4, v11 ; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: v_mul_lo_u32 v11, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v13, v13, v12 -; CGP-NEXT: 
v_mul_hi_u32 v12, v5, v12 -; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v17, v14 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v8, s[8:9], v15, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v13 +; CGP-NEXT: v_mul_hi_u32 v12, v12, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v13 +; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v15, v14 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[8:9] +; CGP-NEXT: v_add_i32_e64 v8, s[8:9], v17, v8 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] ; CGP-NEXT: v_add_i32_e64 v9, s[8:9], v11, v9 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v14, v18 +; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v14, v16 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[8:9] ; CGP-NEXT: v_add_i32_e64 v8, s[8:9], v8, v19 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[8:9] ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v17, v14 -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v15, v18 -; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v16, v19 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 +; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v15, v14 +; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v17, v16 +; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v18, v19 +; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v13 ; CGP-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] ; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v16 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v15, v12 +; CGP-NEXT: v_add_i32_e64 v13, s[6:7], v15, v13 ; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v14 -; CGP-NEXT: v_add_i32_e64 v10, s[6:7], v10, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v13, v11 +; CGP-NEXT: 
v_add_i32_e64 v10, s[6:7], v10, v13 +; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v12, v11 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc ; CGP-NEXT: v_addc_u32_e64 v7, vcc, v7, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -963,27 +963,26 @@ ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_lo_u32 v4, s6, v3 ; CHECK-NEXT: v_mul_lo_u32 v5, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, s6, v2 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_mul_lo_u32 v6, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v3, v5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v7, v2, v4 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v4 ; CHECK-NEXT: v_mul_lo_u32 v9, v3, v4 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: v_add_i32_e32 v5, 
vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 @@ -992,23 +991,22 @@ ; CHECK-NEXT: v_addc_u32_e64 v5, s[4:5], v3, v4, vcc ; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v3, v4 ; CHECK-NEXT: v_mul_lo_u32 v4, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v10, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v4 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v5, v4 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 +; CHECK-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v2 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 ; CHECK-NEXT: v_mul_lo_u32 v7, v2, v6 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v6 +; CHECK-NEXT: v_mul_lo_u32 v10, v5, v6 ; CHECK-NEXT: v_mul_hi_u32 v11, v2, v6 ; CHECK-NEXT: v_mul_hi_u32 v5, v5, v6 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v9, v7 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v8, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v8, v4 +; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v10, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] @@ -1368,117 +1366,113 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_mul_lo_u32 v9, s8, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v11, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v12, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v13, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v14, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v15, s8, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, s8, v4 +; CGP-NEXT: 
v_sub_i32_e32 v8, vcc, v8, v4 +; CGP-NEXT: v_mul_lo_u32 v12, s8, v5 +; CGP-NEXT: v_mul_hi_u32 v13, s8, v5 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v5 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v11, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v16, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v6, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v16, v4, v8 +; CGP-NEXT: v_mul_lo_u32 v17, v6, v8 ; CGP-NEXT: v_mul_hi_u32 v18, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 ; CGP-NEXT: v_mul_lo_u32 v19, v5, v9 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v19 ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_mul_lo_u32 v13, v7, v9 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v17, v10 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 +; CGP-NEXT: 
v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v18 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v16 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v19, v18 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v16, v11 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v19, v18 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v15 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v8, vcc ; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 ; CGP-NEXT: v_mul_lo_u32 v8, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v11, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v12, s8, v4 -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 -; CGP-NEXT: v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5] +; CGP-NEXT: v_mul_hi_u32 v11, s8, v4 +; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v12 +; CGP-NEXT: v_addc_u32_e64 v12, s[6:7], v7, v9, s[4:5] ; CGP-NEXT: v_add_i32_e64 v7, s[6:7], v7, v9 ; CGP-NEXT: v_mul_lo_u32 v9, s8, v5 -; 
CGP-NEXT: v_mul_lo_u32 v14, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v15, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v16, s8, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v10, v8 -; CGP-NEXT: v_mul_hi_u32 v18, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v13, s8, v5 +; CGP-NEXT: v_mul_lo_u32 v14, s8, v10 +; CGP-NEXT: v_mul_lo_u32 v15, v10, v8 +; CGP-NEXT: v_mul_hi_u32 v16, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v10, v8 -; CGP-NEXT: v_mul_lo_u32 v19, s8, v13 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v16 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v9 -; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v14, v19 +; CGP-NEXT: v_mul_lo_u32 v17, s8, v12 +; CGP-NEXT: v_mul_lo_u32 v18, v12, v9 ; CGP-NEXT: v_mul_hi_u32 v19, v5, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v14, v15 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v9 +; CGP-NEXT: v_sub_i32_e64 v14, s[6:7], v14, v4 +; CGP-NEXT: v_sub_i32_e64 v17, s[6:7], v17, v5 +; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v14, v11 +; CGP-NEXT: v_add_i32_e64 v13, s[6:7], v17, v13 ; CGP-NEXT: v_mul_lo_u32 v14, v4, v11 -; CGP-NEXT: v_mul_lo_u32 v15, v5, v12 -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v15, v19 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v11 +; CGP-NEXT: v_mul_lo_u32 v17, v5, v13 +; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v18, v17 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] +; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v17, v19 +; CGP-NEXT: v_mul_lo_u32 v17, v10, v11 ; CGP-NEXT: v_mul_hi_u32 v19, v4, v11 ; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: v_mul_lo_u32 v11, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v13, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v5, v12 -; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v17, v14 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v8, s[8:9], v15, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v13 +; CGP-NEXT: v_mul_hi_u32 v12, v12, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v13 +; 
CGP-NEXT: v_add_i32_e64 v14, s[8:9], v15, v14 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[8:9] +; CGP-NEXT: v_add_i32_e64 v8, s[8:9], v17, v8 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] ; CGP-NEXT: v_add_i32_e64 v9, s[8:9], v11, v9 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v14, v18 +; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v14, v16 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[8:9] ; CGP-NEXT: v_add_i32_e64 v8, s[8:9], v8, v19 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[8:9] ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v17, v14 -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v15, v18 -; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v16, v19 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 +; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v15, v14 +; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v17, v16 +; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v18, v19 +; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v13 ; CGP-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] ; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v16 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v15, v12 +; CGP-NEXT: v_add_i32_e64 v13, s[6:7], v15, v13 ; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v14 -; CGP-NEXT: v_add_i32_e64 v10, s[6:7], v10, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v13, v11 +; CGP-NEXT: v_add_i32_e64 v10, s[6:7], v10, v13 +; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v12, v11 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc ; CGP-NEXT: v_addc_u32_e64 v7, vcc, v7, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 @@ -1614,27 +1608,26 @@ ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; 
CHECK-NEXT: v_mul_lo_u32 v4, s6, v3 ; CHECK-NEXT: v_mul_lo_u32 v5, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, s6, v2 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_mul_lo_u32 v6, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v3, v5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v7, v2, v4 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v4 ; CHECK-NEXT: v_mul_lo_u32 v9, v3, v4 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 @@ -1643,23 +1636,22 @@ ; CHECK-NEXT: v_addc_u32_e64 v5, s[4:5], v3, v4, vcc ; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v3, v4 ; CHECK-NEXT: v_mul_lo_u32 v4, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, s6, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v10, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, s6, v2 +; CHECK-NEXT: 
v_mul_lo_u32 v7, s6, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v4 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v5, v4 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 +; CHECK-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v2 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 ; CHECK-NEXT: v_mul_lo_u32 v7, v2, v6 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v6 +; CHECK-NEXT: v_mul_lo_u32 v10, v5, v6 ; CHECK-NEXT: v_mul_hi_u32 v11, v2, v6 ; CHECK-NEXT: v_mul_hi_u32 v5, v5, v6 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v9, v7 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v8, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v8, v4 +; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v10, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] @@ -2019,117 +2011,113 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_mul_lo_u32 v9, s8, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v11, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v12, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v13, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v14, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v15, s8, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, s8, v4 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 +; CGP-NEXT: v_mul_lo_u32 v12, s8, v5 +; CGP-NEXT: v_mul_hi_u32 v13, s8, v5 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v5 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v11, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v16, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v6, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 -; 
CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v16, v4, v8 +; CGP-NEXT: v_mul_lo_u32 v17, v6, v8 ; CGP-NEXT: v_mul_hi_u32 v18, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 ; CGP-NEXT: v_mul_lo_u32 v19, v5, v9 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v19 ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_mul_lo_u32 v13, v7, v9 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v16 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v17, v10 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v18 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v16 -; CGP-NEXT: 
v_add_i32_e32 v15, vcc, v19, v18 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v16, v11 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v19, v18 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v15 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v8, vcc ; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 ; CGP-NEXT: v_mul_lo_u32 v8, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v11, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v12, s8, v4 -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 -; CGP-NEXT: v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5] +; CGP-NEXT: v_mul_hi_u32 v11, s8, v4 +; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v12 +; CGP-NEXT: v_addc_u32_e64 v12, s[6:7], v7, v9, s[4:5] ; CGP-NEXT: v_add_i32_e64 v7, s[6:7], v7, v9 ; CGP-NEXT: v_mul_lo_u32 v9, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v14, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v15, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v16, s8, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v10, v8 -; CGP-NEXT: v_mul_hi_u32 v18, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v13, s8, v5 +; CGP-NEXT: v_mul_lo_u32 v14, s8, v10 +; CGP-NEXT: v_mul_lo_u32 v15, v10, v8 +; CGP-NEXT: v_mul_hi_u32 v16, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v10, v8 -; CGP-NEXT: v_mul_lo_u32 v19, s8, v13 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v16 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v9 -; CGP-NEXT: 
v_add_i32_e64 v14, s[6:7], v14, v19 +; CGP-NEXT: v_mul_lo_u32 v17, s8, v12 +; CGP-NEXT: v_mul_lo_u32 v18, v12, v9 ; CGP-NEXT: v_mul_hi_u32 v19, v5, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v14, v15 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v9 +; CGP-NEXT: v_sub_i32_e64 v14, s[6:7], v14, v4 +; CGP-NEXT: v_sub_i32_e64 v17, s[6:7], v17, v5 +; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v14, v11 +; CGP-NEXT: v_add_i32_e64 v13, s[6:7], v17, v13 ; CGP-NEXT: v_mul_lo_u32 v14, v4, v11 -; CGP-NEXT: v_mul_lo_u32 v15, v5, v12 -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v15, v19 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v11 +; CGP-NEXT: v_mul_lo_u32 v17, v5, v13 +; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v18, v17 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] +; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v17, v19 +; CGP-NEXT: v_mul_lo_u32 v17, v10, v11 ; CGP-NEXT: v_mul_hi_u32 v19, v4, v11 ; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 -; CGP-NEXT: v_mul_lo_u32 v11, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v13, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v5, v12 -; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v17, v14 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v8, s[8:9], v15, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v13 +; CGP-NEXT: v_mul_hi_u32 v12, v12, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v13 +; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v15, v14 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[8:9] +; CGP-NEXT: v_add_i32_e64 v8, s[8:9], v17, v8 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] ; CGP-NEXT: v_add_i32_e64 v9, s[8:9], v11, v9 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v14, v18 +; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v14, v16 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[8:9] ; CGP-NEXT: v_add_i32_e64 v8, s[8:9], v8, v19 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] +; 
CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[8:9] ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v17, v14 -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v15, v18 -; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v16, v19 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 +; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; CGP-NEXT: v_add_i32_e64 v14, s[6:7], v15, v14 +; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v17, v16 +; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v18, v19 +; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v13 ; CGP-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] ; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v16 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v15, v12 +; CGP-NEXT: v_add_i32_e64 v13, s[6:7], v15, v13 ; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v14 -; CGP-NEXT: v_add_i32_e64 v10, s[6:7], v10, v12 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v13, v11 +; CGP-NEXT: v_add_i32_e64 v10, s[6:7], v10, v13 +; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v12, v11 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc ; CGP-NEXT: v_addc_u32_e64 v7, vcc, v7, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8