Index: llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -318,6 +318,19 @@ bool applyCombineTruncOfShl(MachineInstr &MI, std::pair<Register, Register> &MatchInfo); + /// Transform G_ADD(x, -cst) to G_SUB(x, cst). + bool matchAddNegConstant(MachineInstr &MI, int64_t &Cst); + bool applyAddNegConstant(MachineInstr &MI, int64_t &Cst); + + /// Transform G_ADD(x, G_SUB(y, x)) to y. + /// Transform G_ADD(G_SUB(y, x), x) to y. + bool matchAddSubSameReg(MachineInstr &MI, Register &Src); + bool applyAddSubSameReg(MachineInstr &MI, Register &Src); + + /// Transform G_ADD(x, y) to G_OR(x, y) iff x and y share no common bits. + bool matchAddToOr(MachineInstr &MI); + bool applyAddToOr(MachineInstr &MI); + /// Transform G_MUL(x, -1) to G_SUB(0, x) bool applyCombineMulByNegativeOne(MachineInstr &MI); Index: llvm/include/llvm/CodeGen/GlobalISel/Utils.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -23,6 +23,7 @@ namespace llvm { class AnalysisUsage; +class GISelKnownBits; class MachineFunction; class MachineInstr; class MachineOperand; @@ -249,5 +250,10 @@ /// Returns an integer representing true, as defined by the /// TargetBooleanContents. int64_t getICmpTrueVal(const TargetLowering &TLI, bool IsVector, bool IsFP); + +/// Returns true if \p A and \p B are known to have no common bits set. +bool haveNoCommonBitsSet(const Register A, const Register B, GISelKnownBits &KB, + MachineRegisterInfo &MRI); + } // End namespace llvm. 
#endif Index: llvm/include/llvm/Target/GlobalISel/Combine.td =================================================================== --- llvm/include/llvm/Target/GlobalISel/Combine.td +++ llvm/include/llvm/Target/GlobalISel/Combine.td @@ -480,6 +480,33 @@ (apply [{ return Helper.applyCombineMulByNegativeOne(*${root}); }]) >; +// Transform (add x, -cst) -> (sub x, cst) +def add_neg_constant_matchinfo : GIDefMatchData<"int64_t">; +def add_neg_constant: GICombineRule< + (defs root:$root, add_neg_constant_matchinfo:$matchinfo), + (match (wip_match_opcode G_ADD):$root, + [{ return Helper.matchAddNegConstant(*${root}, ${matchinfo}); }]), + (apply [{ return Helper.applyAddNegConstant(*${root}, ${matchinfo}); }]) +>; + +// Transform (add x, (sub y, x)) -> y +// Transform (add (sub y, x), x) -> y +def add_sub_reg_matchinfo : GIDefMatchData<"Register">; +def add_sub_reg: GICombineRule < + (defs root:$root, add_sub_reg_matchinfo:$matchinfo), + (match (wip_match_opcode G_ADD):$root, + [{ return Helper.matchAddSubSameReg(*${root}, ${matchinfo}); }]), + (apply [{ return Helper.applyAddSubSameReg(*${root}, ${matchinfo}); }]) +>; + +// Transform (add x, y) -> or(x, y) iff x and y share no common bits +def add_to_or: GICombineRule < + (defs root:$root), + (match (wip_match_opcode G_ADD):$root, + [{ return Helper.matchAddToOr(*${root}); }]), + (apply [{ return Helper.applyAddToOr(*${root}); }]) +>; + // FIXME: These should use the custom predicate feature once it lands. 
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, undef_to_negative_one, @@ -496,14 +523,14 @@ fneg_fneg_fold, right_identity_one]>; def known_bits_simplifications : GICombineGroup<[ - and_trivial_mask, redundant_sext_inreg]>; + and_trivial_mask, redundant_sext_inreg, add_to_or]>; def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend]>; def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp]>; def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd, - mul_by_neg_one]>; + mul_by_neg_one, add_neg_constant]>; def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain, combines_for_extload, combine_indexed_load_store, undef_combines, @@ -515,4 +542,4 @@ not_cmp_fold, opt_brcond_by_inverting_cond, unmerge_merge, fabs_fabs_fold, unmerge_cst, unmerge_dead_to_trunc, unmerge_zext_to_zext, trunc_ext_fold, trunc_shl, - constant_fp_op]>; + constant_fp_op, add_sub_reg]>; Index: llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2183,6 +2183,73 @@ return true; } +bool CombinerHelper::matchAddNegConstant(MachineInstr &MI, int64_t &Cst) { + assert(MI.getOpcode() == TargetOpcode::G_ADD && "Expected a G_ADD"); + auto ValAndVReg = + getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); + if (ValAndVReg && ValAndVReg->Value < 0) { + Cst = ValAndVReg->Value; + return true; + } + return false; +} + +bool CombinerHelper::applyAddNegConstant(MachineInstr &MI, int64_t &Cst) { + assert(MI.getOpcode() == TargetOpcode::G_ADD && Cst < 0 && + "Expected a G_ADD and negative constant"); + Register Dst = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(1).getReg(); + + Builder.setInstrAndDebugLoc(MI); + Builder.buildSub(Dst, Reg, Builder.buildConstant(MRI.getType(Reg), -Cst), + MI.getFlags()); + 
MI.eraseFromParent(); + return true; +} + +bool CombinerHelper::matchAddSubSameReg(MachineInstr &MI, Register &Src) { + assert(MI.getOpcode() == TargetOpcode::G_ADD && "Expected a G_ADD"); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + + // Helper lambda to check for opportunities for + // A + (B - A) -> B + // (B - A) + A -> B + auto CheckFold = [&](Register &MaybeSub, Register &MaybeSameReg) { + Register Reg; + return mi_match(MaybeSub, MRI, m_GSub(m_Reg(Src), m_Reg(Reg))) && + Reg == MaybeSameReg; + }; + return CheckFold(LHS, RHS) || CheckFold(RHS, LHS); +} + +bool CombinerHelper::applyAddSubSameReg(MachineInstr &MI, Register &Src) { + assert(MI.getOpcode() == TargetOpcode::G_ADD && "Expected a G_ADD"); + Register Dst = MI.getOperand(0).getReg(); + MI.eraseFromParent(); + replaceRegWith(MRI, Dst, Src); + return true; +} + +bool CombinerHelper::matchAddToOr(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_ADD && "Expected a G_ADD"); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + return haveNoCommonBitsSet(LHS, RHS, *KB, MRI); +} + +bool CombinerHelper::applyAddToOr(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_ADD && "Expected a G_ADD"); + Register Dst = MI.getOperand(0).getReg(); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + + Builder.setInstrAndDebugLoc(MI); + Builder.buildOr(Dst, LHS, RHS); + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::matchAnyExplicitUseIsUndef(MachineInstr &MI) { return any_of(MI.explicit_uses(), [this](const MachineOperand &MO) { return MO.isReg() && Index: llvm/lib/CodeGen/GlobalISel/Utils.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/Optional.h" #include 
"llvm/CodeGen/GlobalISel/GISelChangeObserver.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MachineInstr.h" @@ -754,3 +755,11 @@ } llvm_unreachable("Invalid boolean contents"); } + +bool llvm::haveNoCommonBitsSet(const Register A, const Register B, + GISelKnownBits &KB, MachineRegisterInfo &MRI) { + assert(MRI.getType(B) == MRI.getType(A) && "Invalid operands"); + KnownBits AKnown = KB.getKnownBits(A); + KnownBits BKnown = KB.getKnownBits(B); + return (AKnown.Zero | BKnown.Zero).isAllOnesValue(); +} Index: llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir @@ -0,0 +1,85 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s + +--- +name: add_neg_constant +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $w0 + ; CHECK-LABEL: name: add_neg_constant + ; CHECK: liveins: $w0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[C]] + ; CHECK: $w0 = COPY [[SUB]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 -1 + %2:_(s32) = G_ADD %0, %1(s32) + $w0 = COPY %2(s32) +... 
+--- +name: add_lhs_sub_reg +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $w0, $w1 + ; CHECK-LABEL: name: add_lhs_sub_reg + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: $w0 = COPY [[COPY]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w1 + %2:_(s32) = G_SUB %0, %1 + %3:_(s32) = G_ADD %2, %1(s32) + $w0 = COPY %3(s32) +... +--- +name: add_rhs_sub_reg +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $w0, $w1 + ; CHECK-LABEL: name: add_rhs_sub_reg + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: $w0 = COPY [[COPY]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w1 + %2:_(s32) = G_SUB %0, %1 + %3:_(s32) = G_ADD %1, %2(s32) + $w0 = COPY %3(s32) +... +--- +name: add_to_or +body: | + bb.0: + liveins: $w0, $w1 + ; CHECK-LABEL: name: add_to_or + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[LSHR]] + ; CHECK: $w0 = COPY [[OR]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w1 + %2:_(s32) = G_CONSTANT i32 16 + %3:_(s32) = G_SHL %0, %2 + %4:_(s32) = G_LSHR %1, %2 + %5:_(s32) = G_ADD %3, %4 + $w0 = COPY %5 +... 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -98,10 +98,9 @@ ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0xffc0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_add_u16_e32 v1, s4, v0 -; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, 64 +; GFX8-NEXT: v_subrev_u16_e32 v1, 64, v0 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i16> %a, @@ -120,7 +119,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 4 -; GFX8-NEXT: v_add_u16_e32 v1, 0xffc0, v0 +; GFX8-NEXT: v_subrev_u16_e32 v1, 64, v0 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -139,10 +138,10 @@ ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffc0 -; GFX8-NEXT: v_add_u16_e32 v2, 4, v0 -; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 64 +; GFX8-NEXT: v_add_u16_e32 v1, 4, v0 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i16> %a, ret <2 x i16> %add Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -7,29 +7,36 @@ ; GFX9-LABEL: extractelement_sgpr_v4i128_sgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 -; GFX9-NEXT: s_lshl_b32 m0, s4, 1 +; GFX9-NEXT: s_lshl_b32 s2, s4, 1 +; GFX9-NEXT: s_mov_b32 m0, s2 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_movrels_b64 s[0:1], s[8:9] -; GFX9-NEXT: s_movrels_b64 s[2:3], s[10:11] +; GFX9-NEXT: s_or_b32 m0, s2, 1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_movrels_b64 s[2:3], s[8:9] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: extractelement_sgpr_v4i128_sgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 -; GFX8-NEXT: s_lshl_b32 m0, s4, 1 +; GFX8-NEXT: s_lshl_b32 s2, s4, 1 +; GFX8-NEXT: s_mov_b32 m0, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_movrels_b64 s[0:1], s[8:9] -; GFX8-NEXT: s_movrels_b64 s[2:3], s[10:11] +; GFX8-NEXT: s_or_b32 m0, s2, 1 +; GFX8-NEXT: s_movrels_b64 s[2:3], s[8:9] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: extractelement_sgpr_v4i128_sgpr_idx: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 -; GFX7-NEXT: s_lshl_b32 m0, s4, 1 +; GFX7-NEXT: s_lshl_b32 s2, s4, 1 +; GFX7-NEXT: s_mov_b32 m0, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_movrels_b64 s[0:1], s[8:9] -; GFX7-NEXT: s_movrels_b64 s[2:3], s[10:11] +; GFX7-NEXT: s_or_b32 m0, s2, 1 +; GFX7-NEXT: s_movrels_b64 s[2:3], s[8:9] ; GFX7-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 %idx @@ -44,11 +51,15 @@ ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 ; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 ; GFX9-NEXT: s_lshl_b32 s0, s2, 1 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: 
s_set_gpr_idx_on s0, gpr_idx(SRC0) +; GFX9-NEXT: s_lshl_b32 s1, s0, 1 +; GFX9-NEXT: s_set_gpr_idx_on s1, gpr_idx(SRC0) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: s_or_b32 s0, s0, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) ; GFX9-NEXT: v_mov_b32_e32 v18, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, v3 ; GFX9-NEXT: s_set_gpr_idx_off @@ -72,14 +83,16 @@ ; GFX8-NEXT: flat_load_dwordx4 v[14:17], v[0:1] ; GFX8-NEXT: s_lshl_b32 s0, s2, 1 ; GFX8-NEXT: s_lshl_b32 m0, s0, 1 +; GFX8-NEXT: s_or_b32 s0, s0, 1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_movrels_b32_e32 v1, v3 ; GFX8-NEXT: v_movrels_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_lshl_b32 m0, s0, 1 +; GFX8-NEXT: v_movrels_b32_e32 v18, v2 +; GFX8-NEXT: v_movrels_b32_e32 v3, v3 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s2, v18 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; @@ -94,14 +107,16 @@ ; GFX7-NEXT: buffer_load_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX7-NEXT: s_lshl_b32 s0, s2, 1 ; GFX7-NEXT: s_lshl_b32 m0, s0, 1 +; GFX7-NEXT: s_or_b32 s0, s0, 1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_movrels_b32_e32 v1, v3 ; GFX7-NEXT: v_movrels_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_lshl_b32 m0, s0, 1 +; GFX7-NEXT: v_movrels_b32_e32 v18, v2 +; GFX7-NEXT: v_movrels_b32_e32 v3, v3 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s1, v1 -; GFX7-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NEXT: v_readfirstlane_b32 s2, v18 ; GFX7-NEXT: v_readfirstlane_b32 s3, v3 ; GFX7-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr @@ -116,7 
+131,7 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 1, v2 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX9-NEXT: v_add_u32_e32 v17, 1, v16 +; GFX9-NEXT: v_or_b32_e32 v17, 1, v16 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 @@ -175,7 +190,7 @@ ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[3:4] ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 1, v2 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v16 +; GFX8-NEXT: v_or_b32_e32 v17, 1, v16 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 @@ -239,7 +254,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 1, v2 ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[8:11], 0 addr64 offset:16 -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 1, v16 +; GFX7-NEXT: v_or_b32_e32 v17, 1, v16 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 @@ -295,260 +310,90 @@ } define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(<4 x i128> addrspace(4)* inreg %ptr, i32 %idx) { -; GFX9-LABEL: extractelement_sgpr_v4i128_vgpr_idx: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; 
GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GFX9-NEXT: v_mov_b32_e32 v8, s7 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX9-NEXT: v_mov_b32_e32 v9, s8 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; GFX9-NEXT: v_mov_b32_e32 v10, s9 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GFX9-NEXT: v_mov_b32_e32 v11, s10 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 -; GFX9-NEXT: v_mov_b32_e32 v12, s11 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc -; GFX9-NEXT: v_mov_b32_e32 v13, s12 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 -; GFX9-NEXT: v_mov_b32_e32 v14, s13 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v14, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 -; GFX9-NEXT: v_mov_b32_e32 v15, s14 -; GFX9-NEXT: v_mov_b32_e32 v16, s15 -; GFX9-NEXT: v_add_u32_e32 v2, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v16, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_mov_b32_e32 v7, s4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; GFX9-NEXT: v_mov_b32_e32 v8, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX9-NEXT: v_mov_b32_e32 v9, s6 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GFX9-NEXT: v_mov_b32_e32 v10, s7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX9-NEXT: v_mov_b32_e32 v11, s8 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; GFX9-NEXT: v_mov_b32_e32 v12, s9 -; GFX9-NEXT: v_mov_b32_e32 v13, s10 -; GFX9-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; 
GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 -; GFX9-NEXT: v_mov_b32_e32 v6, s12 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 -; GFX9-NEXT: v_mov_b32_e32 v8, s14 -; GFX9-NEXT: v_mov_b32_e32 v9, s15 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s3, v3 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: extractelement_sgpr_v4i128_vgpr_idx: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: v_mov_b32_e32 v6, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, s7 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX8-NEXT: v_mov_b32_e32 v9, s8 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; GFX8-NEXT: v_mov_b32_e32 v10, s9 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GFX8-NEXT: v_mov_b32_e32 v11, s10 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 
-; GFX8-NEXT: v_mov_b32_e32 v12, s11 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc -; GFX8-NEXT: v_mov_b32_e32 v13, s12 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 -; GFX8-NEXT: v_mov_b32_e32 v14, s13 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v14, vcc -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v16, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v5, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX8-NEXT: v_mov_b32_e32 v7, s4 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX8-NEXT: v_mov_b32_e32 v9, s6 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GFX8-NEXT: v_mov_b32_e32 v10, s7 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX8-NEXT: v_mov_b32_e32 v11, s8 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; GFX8-NEXT: v_mov_b32_e32 v12, s9 -; GFX8-NEXT: v_mov_b32_e32 v13, s10 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 -; GFX8-NEXT: v_mov_b32_e32 v6, s12 -; GFX8-NEXT: v_mov_b32_e32 v7, s13 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, s14 -; GFX8-NEXT: v_mov_b32_e32 v9, s15 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, 7, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s2, v2 -; GFX8-NEXT: v_readfirstlane_b32 s3, v3 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX7-LABEL: extractelement_sgpr_v4i128_vgpr_idx: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s3 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: v_mov_b32_e32 v6, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, s7 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX7-NEXT: v_mov_b32_e32 v9, s8 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; GFX7-NEXT: v_mov_b32_e32 v10, s9 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GFX7-NEXT: v_mov_b32_e32 v11, s10 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 -; GFX7-NEXT: v_mov_b32_e32 v12, s11 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc -; GFX7-NEXT: v_mov_b32_e32 v13, s12 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 -; GFX7-NEXT: v_mov_b32_e32 v14, s13 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v14, vcc -; GFX7-NEXT: v_mov_b32_e32 v15, s14 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 -; GFX7-NEXT: v_mov_b32_e32 v16, s15 -; GFX7-NEXT: 
v_cndmask_b32_e32 v0, v1, v15, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v16, vcc -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v2 -; GFX7-NEXT: v_mov_b32_e32 v4, s1 -; GFX7-NEXT: v_mov_b32_e32 v5, s2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX7-NEXT: v_mov_b32_e32 v7, s4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX7-NEXT: v_mov_b32_e32 v9, s6 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GFX7-NEXT: v_mov_b32_e32 v10, s7 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX7-NEXT: v_mov_b32_e32 v11, s8 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; GFX7-NEXT: v_mov_b32_e32 v12, s9 -; GFX7-NEXT: v_mov_b32_e32 v13, s10 -; GFX7-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, s12 -; GFX7-NEXT: v_mov_b32_e32 v7, s13 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, s14 -; GFX7-NEXT: v_mov_b32_e32 v9, s15 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s1, v1 -; GFX7-NEXT: v_readfirstlane_b32 s2, v2 -; GFX7-NEXT: v_readfirstlane_b32 s3, v3 -; GFX7-NEXT: ; return to shader part epilog +; GCN-LABEL: extractelement_sgpr_v4i128_vgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 1, v0 
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: v_mov_b32_e32 v6, s5 +; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GCN-NEXT: v_mov_b32_e32 v7, s6 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GCN-NEXT: v_mov_b32_e32 v8, s7 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GCN-NEXT: v_mov_b32_e32 v9, s8 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GCN-NEXT: v_mov_b32_e32 v10, s9 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GCN-NEXT: v_mov_b32_e32 v11, s10 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GCN-NEXT: v_mov_b32_e32 v12, s11 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GCN-NEXT: v_mov_b32_e32 v13, s12 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GCN-NEXT: v_mov_b32_e32 v14, s13 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v14, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GCN-NEXT: v_mov_b32_e32 v15, s14 +; GCN-NEXT: v_mov_b32_e32 v16, s15 +; GCN-NEXT: v_or_b32_e32 v2, 1, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v16, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s1 +; GCN-NEXT: v_mov_b32_e32 v5, s2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GCN-NEXT: v_mov_b32_e32 v6, s3 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: v_mov_b32_e32 v7, s4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GCN-NEXT: v_mov_b32_e32 v8, s5 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GCN-NEXT: 
v_cndmask_b32_e32 v4, v4, v8, vcc +; GCN-NEXT: v_mov_b32_e32 v9, s6 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GCN-NEXT: v_mov_b32_e32 v10, s7 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GCN-NEXT: v_mov_b32_e32 v11, s8 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GCN-NEXT: v_mov_b32_e32 v12, s9 +; GCN-NEXT: v_mov_b32_e32 v13, s10 +; GCN-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GCN-NEXT: v_mov_b32_e32 v6, s12 +; GCN-NEXT: v_mov_b32_e32 v7, s13 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GCN-NEXT: v_mov_b32_e32 v8, s14 +; GCN-NEXT: v_mov_b32_e32 v9, s15 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-NEXT: v_readfirstlane_b32 s3, v3 +; GCN-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 %idx ret i128 %element Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -777,7 +777,7 @@ ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s3 -; GCN-NEXT: s_add_i32 m0, s18, -1 +; GCN-NEXT: s_sub_i32 m0, s18, 1 ; GCN-NEXT: s_mov_b32 s2, s4 ; GCN-NEXT: s_mov_b32 s3, s5 ; GCN-NEXT: s_mov_b32 s4, s6 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -1023,7 +1023,7 @@ ; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 24 ; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 23, v3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xff800000, v0 +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, 0x800000, v0 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1082,7 +1082,7 @@ ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 ; GFX8-NEXT: s_xor_b32 s0, s1, s0 ; GFX8-NEXT: s_ashr_i32 s1, s3, 23 -; GFX8-NEXT: s_add_i32 s1, s1, 0xff800000 +; GFX8-NEXT: s_sub_i32 s1, s1, 0x800000 ; GFX8-NEXT: s_and_b32 s0, s0, 1 ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 ; GFX8-NEXT: s_cselect_b32 s0, s1, s2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -56,11 +56,11 @@ ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v5, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_or_b32_e32 v3, v5, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CGP-NEXT: v_or_b32_e32 v2, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v2, v1 ; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -132,11 +132,11 @@ ; CGP-NEXT: v_mul_lo_u32 v1, s1, v0 ; CGP-NEXT: v_mul_lo_u32 v2, 0, v1 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CGP-NEXT: v_or_b32_e32 v1, v2, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_mul_lo_u32 v1, 0, v0 ; CGP-NEXT: v_mul_hi_u32 v0, s0, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_or_b32_e32 v0, 
v1, v0 ; CGP-NEXT: v_mul_lo_u32 v1, v0, s2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 @@ -251,16 +251,16 @@ ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 ; CGP-NEXT: v_mul_lo_u32 v11, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; CGP-NEXT: v_or_b32_e32 v5, v10, v5 +; CGP-NEXT: v_or_b32_e32 v7, v11, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 ; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_or_b32_e32 v4, v6, v4 +; CGP-NEXT: v_or_b32_e32 v5, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; CGP-NEXT: v_mul_lo_u32 v10, v5, v3 @@ -329,7 +329,7 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; GISEL-NEXT: s_add_i32 s8, 0x1000, 0 +; GISEL-NEXT: s_movk_i32 s8, 0x1000 ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s8 @@ -473,7 +473,7 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; GISEL-NEXT: s_add_i32 s8, 0x12d8fb, 0 +; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s8 @@ -718,16 +718,16 @@ ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v11, 0, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_or_b32_e32 v7, v10, v7 +; CGP-NEXT: v_or_b32_e32 v9, v11, v9 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 ; CGP-NEXT: v_mul_lo_u32 
v8, 0, v6 ; CGP-NEXT: v_mul_hi_u32 v6, v0, v6 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_or_b32_e32 v6, v8, v6 +; CGP-NEXT: v_or_b32_e32 v7, v9, v7 ; CGP-NEXT: v_mul_lo_u32 v8, v6, v2 ; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v6 ; CGP-NEXT: v_mul_lo_u32 v10, v7, v3 @@ -767,8 +767,8 @@ ; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v3 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1 @@ -809,11 +809,11 @@ ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_or_b32_e32 v3, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CGP-NEXT: v_or_b32_e32 v2, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v2, v1 ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -922,16 +922,16 @@ ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_or_b32_e32 v5, v8, v5 +; CGP-NEXT: v_or_b32_e32 v7, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 ; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_or_b32_e32 v4, v6, v4 +; CGP-NEXT: 
v_or_b32_e32 v5, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; CGP-NEXT: v_mul_lo_u32 v8, v5, v3 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -1157,11 +1157,11 @@ ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_mul_lo_u32 v5, 0, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, s6, v2 +; CHECK-NEXT: v_or_b32_e32 v5, v5, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 ; CHECK-NEXT: v_subb_u32_e64 v6, s[4:5], v1, v5, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v5 ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v6 @@ -1607,11 +1607,11 @@ ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s7, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 +; CGP-NEXT: v_mul_hi_u32 v11, s7, v7 +; CGP-NEXT: v_mul_lo_u32 v12, s7, v7 +; CGP-NEXT: v_or_b32_e32 v9, v9, v10 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 ; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v9, vcc ; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v9 ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v10 @@ -1656,7 +1656,7 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 +; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_add_i32_e32 
v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 @@ -1687,7 +1687,7 @@ ; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v8, v4, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 -; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 @@ -1708,51 +1708,51 @@ ; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_mul_lo_u32 v8, v3, v4 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_mul_hi_u32 v5, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v3, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v3, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_mul_hi_u32 v9, v2, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: 
v_mul_lo_u32 v7, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v5 -; CGP-NEXT: v_mul_hi_u32 v10, s7, v4 -; CGP-NEXT: v_mul_lo_u32 v9, s7, v4 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v7, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v7 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_mul_lo_u32 v8, 0, v4 +; CGP-NEXT: v_mul_lo_u32 v9, s7, v7 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; CGP-NEXT: v_mul_hi_u32 v5, s7, v4 +; CGP-NEXT: v_mul_lo_u32 v10, s7, v4 +; CGP-NEXT: v_or_b32_e32 v8, v8, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v5 ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v8 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v2 ; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s7, v2 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v7, vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 @@ -1764,9 +1764,9 @@ ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; CGP-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 @@ -1880,11 +1880,11 @@ ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_mul_lo_u32 v5, 0, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, s6, v2 +; CHECK-NEXT: v_or_b32_e32 v5, v5, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 ; CHECK-NEXT: v_subb_u32_e64 v6, s[4:5], v1, v5, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v5 ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v6 @@ -2330,11 +2330,11 @@ ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s7, v7 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 +; CGP-NEXT: v_mul_hi_u32 v11, s7, v7 +; CGP-NEXT: v_mul_lo_u32 v12, s7, v7 +; CGP-NEXT: v_or_b32_e32 v9, v9, v10 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 ; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v9, vcc ; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v9 ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v10 @@ -2379,7 +2379,7 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 +; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: 
v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 @@ -2410,7 +2410,7 @@ ; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v8, v4, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 -; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 @@ -2431,51 +2431,51 @@ ; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_mul_lo_u32 v8, v3, v4 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_mul_hi_u32 v5, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v3, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v3, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_mul_hi_u32 v9, v2, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v4 -; 
CGP-NEXT: v_mul_lo_u32 v8, s7, v5 -; CGP-NEXT: v_mul_hi_u32 v10, s7, v4 -; CGP-NEXT: v_mul_lo_u32 v9, s7, v4 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v7, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v7 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_mul_lo_u32 v8, 0, v4 +; CGP-NEXT: v_mul_lo_u32 v9, s7, v7 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; CGP-NEXT: v_mul_hi_u32 v5, s7, v4 +; CGP-NEXT: v_mul_lo_u32 v10, s7, v4 +; CGP-NEXT: v_or_b32_e32 v8, v8, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v5 ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v8 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v2 ; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s7, v2 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v7, vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 @@ -2487,9 +2487,9 @@ ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; 
CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -52,11 +52,11 @@ ; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 ; CGP-NEXT: v_mul_lo_u32 v5, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_or_b32_e32 v4, v5, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_or_b32_e32 v3, v4, v3 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v1 @@ -122,11 +122,11 @@ ; CGP-NEXT: v_mul_lo_u32 v1, s3, v0 ; CGP-NEXT: v_mul_lo_u32 v2, 0, v1 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CGP-NEXT: v_or_b32_e32 v1, v2, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_mul_lo_u32 v1, 0, v0 ; CGP-NEXT: v_mul_hi_u32 v0, s0, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_or_b32_e32 v0, v1, v0 ; CGP-NEXT: v_mul_lo_u32 v0, v0, s1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; CGP-NEXT: v_subrev_i32_e32 v1, vcc, s1, v0 @@ -231,16 +231,16 @@ ; CGP-NEXT: v_mul_hi_u32 v7, v5, v7 ; CGP-NEXT: v_mul_lo_u32 v11, 0, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_or_b32_e32 v7, v10, v7 +; CGP-NEXT: v_or_b32_e32 v9, v11, v9 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 ; CGP-NEXT: v_mul_lo_u32 v8, 0, v5 ; CGP-NEXT: 
v_mul_hi_u32 v5, v0, v5 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_or_b32_e32 v5, v8, v5 +; CGP-NEXT: v_or_b32_e32 v7, v9, v7 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v2 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 @@ -303,7 +303,7 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; GISEL-NEXT: s_add_i32 s4, 0x1000, 0 +; GISEL-NEXT: s_movk_i32 s4, 0x1000 ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s4 @@ -437,7 +437,7 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; GISEL-NEXT: s_add_i32 s4, 0x12d8fb, 0 +; GISEL-NEXT: s_mov_b32 s4, 0x12d8fb ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s4 @@ -663,16 +663,16 @@ ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v11, 0, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_or_b32_e32 v7, v10, v7 +; CGP-NEXT: v_or_b32_e32 v9, v11, v9 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 ; CGP-NEXT: v_mul_lo_u32 v8, 0, v6 ; CGP-NEXT: v_mul_hi_u32 v6, v0, v6 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_or_b32_e32 v6, v8, v6 +; CGP-NEXT: v_or_b32_e32 v7, v9, v7 ; CGP-NEXT: v_mul_lo_u32 v6, v6, v2 ; CGP-NEXT: v_mul_lo_u32 v7, v7, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 @@ -708,8 +708,8 @@ ; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 -; GISEL-NEXT: 
v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v3 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 @@ -747,11 +747,11 @@ ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_or_b32_e32 v3, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CGP-NEXT: v_or_b32_e32 v2, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 @@ -852,16 +852,16 @@ ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_or_b32_e32 v5, v8, v5 +; CGP-NEXT: v_or_b32_e32 v7, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 ; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_or_b32_e32 v4, v6, v4 +; CGP-NEXT: v_or_b32_e32 v5, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -1137,20 +1137,20 @@ ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_mul_lo_u32 v5, 0, v2 ; CHECK-NEXT: v_mul_lo_u32 v4, s6, v4 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v2 -; 
CHECK-NEXT: v_mul_hi_u32 v2, s6, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; CHECK-NEXT: v_mul_hi_u32 v6, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v2, s6, v2 +; CHECK-NEXT: v_or_b32_e32 v4, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v1, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4 +; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] ; CHECK-NEXT: v_subrev_i32_e32 v5, vcc, s6, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 0, v1 @@ -1164,9 +1164,9 @@ ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -1583,20 +1583,20 @@ ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_lo_u32 v8, s7, v8 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v7 -; CGP-NEXT: v_mul_hi_u32 v7, s7, v7 -; CGP-NEXT: 
v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v7, vcc -; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7 -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v8 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_mul_hi_u32 v10, s7, v7 +; CGP-NEXT: v_mul_lo_u32 v7, s7, v7 +; CGP-NEXT: v_or_b32_e32 v8, v9, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v1, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v7 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CGP-NEXT: v_subrev_i32_e32 v9, vcc, s7, v0 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v1 @@ -1610,19 +1610,19 @@ ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; CGP-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v7, v7 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_mul_lo_u32 v8, -1, v4 ; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 ; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: 
v_addc_u32_e32 v3, vcc, v3, v6, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1630,7 +1630,7 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 +; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 @@ -1661,7 +1661,7 @@ ; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v8, v4, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 -; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 @@ -1682,47 +1682,47 @@ ; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_mul_lo_u32 v8, v3, v4 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_mul_hi_u32 v5, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v3, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v3, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_mul_hi_u32 v9, v2, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; 
CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v5, s7, v5 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v4 -; CGP-NEXT: v_mul_hi_u32 v4, s7, v4 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_mul_lo_u32 v8, 0, v4 +; CGP-NEXT: v_mul_lo_u32 v7, s7, v7 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; CGP-NEXT: v_mul_hi_u32 v5, s7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, s7, v4 +; CGP-NEXT: v_or_b32_e32 v7, v8, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v3, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v5 +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v4 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v2 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] ; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s7, v2 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, 
vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v3 @@ -1736,9 +1736,9 @@ ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 @@ -1852,20 +1852,20 @@ ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_mul_lo_u32 v5, 0, v2 ; CHECK-NEXT: v_mul_lo_u32 v4, s6, v4 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, s6, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; CHECK-NEXT: v_mul_hi_u32 v6, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v2, s6, v2 +; CHECK-NEXT: v_or_b32_e32 v4, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v1, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4 +; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] ; CHECK-NEXT: v_subrev_i32_e32 v5, vcc, s6, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 0, v1 @@ -1879,9 
+1879,9 @@ ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -2298,20 +2298,20 @@ ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_lo_u32 v8, s7, v8 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v7 -; CGP-NEXT: v_mul_hi_u32 v7, s7, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v7, vcc -; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7 -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v8 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_mul_hi_u32 v10, s7, v7 +; CGP-NEXT: v_mul_lo_u32 v7, s7, v7 +; CGP-NEXT: v_or_b32_e32 v8, v9, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v1, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v7 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CGP-NEXT: v_subrev_i32_e32 v9, vcc, s7, v0 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v1 @@ -2325,19 +2325,19 @@ ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v11, 
vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; CGP-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v7, v7 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_mul_lo_u32 v8, -1, v4 ; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 ; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -2345,7 +2345,7 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 +; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 @@ -2376,7 +2376,7 @@ ; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v8, v4, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 -; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 @@ -2397,47 +2397,47 @@ ; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_mul_lo_u32 v8, v3, v4 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_mul_hi_u32 v5, v2, v4 +; CGP-NEXT: v_mul_hi_u32 
v10, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v3, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v3, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_mul_hi_u32 v9, v2, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v7, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v5, s7, v5 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v4 -; CGP-NEXT: v_mul_hi_u32 v4, s7, v4 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_mul_lo_u32 v8, 0, v4 +; CGP-NEXT: v_mul_lo_u32 v7, s7, v7 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; CGP-NEXT: v_mul_hi_u32 v5, s7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, s7, v4 +; CGP-NEXT: v_or_b32_e32 v7, v8, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 -; CGP-NEXT: v_cmp_le_u32_e64 
s[4:5], 0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v3, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v5 +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], 0, v4 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v2 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] ; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s7, v2 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cmp_le_u32_e32 vcc, 0, v3 @@ -2451,9 +2451,9 @@ ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -1023,7 +1023,7 @@ ; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 24 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[6:7], 0, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 23, v3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xff800000, v0 +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, 0x800000, v0 ; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1082,7 +1082,7 @@ ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 ; GFX8-NEXT: s_xor_b32 s0, s1, s0 ; GFX8-NEXT: s_ashr_i32 s1, s3, 23 -; GFX8-NEXT: 
s_add_i32 s1, s1, 0xff800000 +; GFX8-NEXT: s_sub_i32 s1, s1, 0x800000 ; GFX8-NEXT: s_and_b32 s0, s0, 1 ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 ; GFX8-NEXT: s_cselect_b32 s0, s1, s2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll @@ -40,11 +40,11 @@ ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_or_b32_e32 v3, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CGP-NEXT: v_or_b32_e32 v2, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v2, v1 ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -98,11 +98,11 @@ ; CGP-NEXT: v_mul_lo_u32 v1, s2, v0 ; CGP-NEXT: v_mul_lo_u32 v2, 0, v1 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CGP-NEXT: v_or_b32_e32 v1, v2, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_mul_lo_u32 v1, 0, v0 ; CGP-NEXT: v_mul_hi_u32 v0, s0, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_or_b32_e32 v0, v1, v0 ; CGP-NEXT: v_mul_lo_u32 v1, v0, s1 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 @@ -183,16 +183,16 @@ ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_or_b32_e32 v5, v8, v5 +; CGP-NEXT: v_or_b32_e32 v7, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 ; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; 
CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_or_b32_e32 v4, v6, v4 +; CGP-NEXT: v_or_b32_e32 v5, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; CGP-NEXT: v_mul_lo_u32 v8, v5, v3 @@ -554,16 +554,16 @@ ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_or_b32_e32 v5, v8, v5 +; CGP-NEXT: v_or_b32_e32 v7, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 ; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_or_b32_e32 v4, v6, v4 +; CGP-NEXT: v_or_b32_e32 v5, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; CGP-NEXT: v_mul_lo_u32 v8, v5, v3 @@ -632,11 +632,11 @@ ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_or_b32_e32 v3, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CGP-NEXT: v_or_b32_e32 v2, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v3, v2, v1 ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -727,16 +727,16 @@ ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_or_b32_e32 v5, v8, v5 +; CGP-NEXT: v_or_b32_e32 v7, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 ; CGP-NEXT: 
v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_or_b32_e32 v4, v6, v4 +; CGP-NEXT: v_or_b32_e32 v5, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v6, v4, v2 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; CGP-NEXT: v_mul_lo_u32 v8, v5, v3 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -1062,7 +1062,7 @@ ; CHECK-NEXT: v_mul_lo_u32 v4, s7, v3 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v2 ; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_or_b32_e32 v4, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v8 ; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 @@ -1303,8 +1303,8 @@ ; GISEL-NEXT: v_mul_lo_u32 v17, s10, v7 ; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v5 ; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v7, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v17 +; GISEL-NEXT: v_or_b32_e32 v8, v12, v8 +; GISEL-NEXT: v_or_b32_e32 v12, v14, v17 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v9 ; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 @@ -1559,8 +1559,8 @@ ; CGP-NEXT: v_mul_lo_u32 v17, s10, v7 ; CGP-NEXT: v_add_i32_e32 v18, vcc, 1, v5 ; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v7, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v17 +; CGP-NEXT: v_or_b32_e32 v8, v12, v8 +; CGP-NEXT: v_or_b32_e32 v12, v14, v17 ; CGP-NEXT: v_add_i32_e32 v14, vcc, 1, v9 ; CGP-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 @@ -1723,7 +1723,7 @@ ; CHECK-NEXT: v_mul_lo_u32 v4, s7, v3 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v2 ; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, 
v3, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_or_b32_e32 v4, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v8 ; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 @@ -1964,8 +1964,8 @@ ; GISEL-NEXT: v_mul_lo_u32 v17, s10, v7 ; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v5 ; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v7, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v17 +; GISEL-NEXT: v_or_b32_e32 v8, v12, v8 +; GISEL-NEXT: v_or_b32_e32 v12, v14, v17 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v9 ; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 @@ -2220,8 +2220,8 @@ ; CGP-NEXT: v_mul_lo_u32 v17, s10, v7 ; CGP-NEXT: v_add_i32_e32 v18, vcc, 1, v5 ; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v7, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v17 +; CGP-NEXT: v_or_b32_e32 v8, v12, v8 +; CGP-NEXT: v_or_b32_e32 v12, v14, v17 ; CGP-NEXT: v_add_i32_e32 v14, vcc, 1, v9 ; CGP-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 @@ -3323,8 +3323,8 @@ ; GISEL-NEXT: v_mul_lo_u32 v17, v4, v7 ; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v5 ; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v7, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v17 +; GISEL-NEXT: v_or_b32_e32 v8, v12, v8 +; GISEL-NEXT: v_or_b32_e32 v12, v14, v17 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -38,11 +38,11 @@ ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: 
v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_or_b32_e32 v3, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CGP-NEXT: v_or_b32_e32 v2, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 @@ -92,11 +92,11 @@ ; CGP-NEXT: v_mul_lo_u32 v1, s2, v0 ; CGP-NEXT: v_mul_lo_u32 v2, 0, v1 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CGP-NEXT: v_or_b32_e32 v1, v2, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_mul_lo_u32 v1, 0, v0 ; CGP-NEXT: v_mul_hi_u32 v0, s0, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_or_b32_e32 v0, v1, v0 ; CGP-NEXT: v_mul_lo_u32 v0, v0, s1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; CGP-NEXT: v_subrev_i32_e32 v1, vcc, s1, v0 @@ -172,16 +172,16 @@ ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_or_b32_e32 v5, v8, v5 +; CGP-NEXT: v_or_b32_e32 v7, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 ; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_or_b32_e32 v4, v6, v4 +; CGP-NEXT: v_or_b32_e32 v5, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 @@ -497,16 +497,16 @@ ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_or_b32_e32 v5, v8, v5 +; CGP-NEXT: v_or_b32_e32 v7, v9, v7 ; CGP-NEXT: 
v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 ; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_or_b32_e32 v4, v6, v4 +; CGP-NEXT: v_or_b32_e32 v5, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 @@ -569,11 +569,11 @@ ; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v4, 0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_or_b32_e32 v3, v4, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_lo_u32 v3, 0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CGP-NEXT: v_or_b32_e32 v2, v3, v2 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 @@ -659,16 +659,16 @@ ; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 ; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_or_b32_e32 v5, v8, v5 +; CGP-NEXT: v_or_b32_e32 v7, v9, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v6, 0, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v0, v4 ; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_or_b32_e32 v4, v6, v4 +; CGP-NEXT: v_or_b32_e32 v5, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 ; CGP-NEXT: v_mul_lo_u32 v5, v5, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ 
llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -1046,7 +1046,7 @@ ; CHECK-NEXT: v_mul_hi_u32 v2, s7, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_mul_lo_u32 v3, s7, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_or_b32_e32 v3, v6, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v1, v2, vcc @@ -1283,8 +1283,8 @@ ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_mul_lo_u32 v6, s10, v6 ; GISEL-NEXT: v_mul_lo_u32 v7, s10, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; GISEL-NEXT: v_or_b32_e32 v6, v12, v6 +; GISEL-NEXT: v_or_b32_e32 v7, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 @@ -1535,8 +1535,8 @@ ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_mul_lo_u32 v6, s10, v6 ; CGP-NEXT: v_mul_lo_u32 v7, s10, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; CGP-NEXT: v_or_b32_e32 v6, v12, v6 +; CGP-NEXT: v_or_b32_e32 v7, v13, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 @@ -1697,7 +1697,7 @@ ; CHECK-NEXT: v_mul_hi_u32 v2, s7, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_mul_lo_u32 v3, s7, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_or_b32_e32 v3, v6, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v1, v2, vcc @@ -1934,8 +1934,8 @@ ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_mul_lo_u32 v6, s10, v6 ; GISEL-NEXT: v_mul_lo_u32 v7, s10, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; GISEL-NEXT: v_or_b32_e32 v6, v12, v6 +; GISEL-NEXT: v_or_b32_e32 v7, v13, v7 ; GISEL-NEXT: 
v_add_i32_e32 v4, vcc, v6, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 @@ -2186,8 +2186,8 @@ ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_mul_lo_u32 v6, s10, v6 ; CGP-NEXT: v_mul_lo_u32 v7, s10, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; CGP-NEXT: v_or_b32_e32 v6, v12, v6 +; CGP-NEXT: v_or_b32_e32 v7, v13, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 @@ -3274,8 +3274,8 @@ ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v9 ; GISEL-NEXT: v_mul_lo_u32 v6, v3, v6 ; GISEL-NEXT: v_mul_lo_u32 v7, v4, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; GISEL-NEXT: v_or_b32_e32 v6, v12, v6 +; GISEL-NEXT: v_or_b32_e32 v7, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10