diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -602,6 +602,14 @@
   /// feeding a G_AND instruction \p MI.
   bool matchNarrowBinopFeedingAnd(MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  /// Given a G_UDIV \p MI expressing a divide by constant, return an
+  /// expression that implements it by multiplying by a magic number.
+  /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
+  MachineInstr *buildUDivUsingMul(MachineInstr &MI);
+  /// Combine G_UDIV by constant into a multiply by magic constant.
+  bool matchUDivByConst(MachineInstr &MI);
+  void applyUDivByConst(MachineInstr &MI);
+
   /// Try to transform \p MI by using all of the above
   /// combine functions. Returns true if changed.
   bool tryCombine(MachineInstr &MI);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -397,6 +397,11 @@
 Optional<RegOrConstant> getVectorSplat(const MachineInstr &MI,
                                        const MachineRegisterInfo &MRI);
 
+/// Determines if \p MI defines a constant integer or a build vector of
+/// constant integers. Treats undef values as constants.
+bool isConstantOrConstantVector(MachineInstr &MI,
+                                const MachineRegisterInfo &MRI);
+
 /// Determines if \p MI defines a constant integer or a splat vector of
 /// constant integers.
 /// \returns the scalar constant or None.
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -694,6 +694,15 @@
 def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg,
                                             bitfield_extract_from_and,
                                             bitfield_extract_from_shr]>;
+
+def udiv_by_const : GICombineRule<
+  (defs root:$root),
+  (match (wip_match_opcode G_UDIV):$root,
+   [{ return Helper.matchUDivByConst(*${root}); }]),
+  (apply [{ Helper.applyUDivByConst(*${root}); }])>;
+
+def intdiv_combines : GICombineGroup<[udiv_by_const]>;
+
 def reassoc_ptradd : GICombineRule<
   (defs root:$root, build_fn_matchinfo:$matchinfo),
   (match (wip_match_opcode G_PTR_ADD):$root,
@@ -761,7 +770,8 @@
     const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
     shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
     truncstore_merge, div_rem_to_divrem, funnel_shift_combines,
-    form_bitfield_extract, constant_fold, fabs_fneg_fold]>;
+    form_bitfield_extract, constant_fold, fabs_fneg_fold,
+    intdiv_combines]>;
 
 // A combine group used for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -30,6 +30,7 @@
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/DivisionByConstantInfo.h"
 #include "llvm/Support/MathExtras.h"
 #include <tuple>
@@ -4422,6 +4423,162 @@
   return true;
 }
 
+MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::G_UDIV);
+  auto &UDiv = cast<GenericMachineInstr>(MI);
+  Register Dst = UDiv.getReg(0);
+  Register LHS = UDiv.getReg(1);
+  Register RHS = UDiv.getReg(2);
+  LLT Ty = MRI.getType(Dst);
+  LLT ScalarTy = Ty.getScalarType();
+  const unsigned EltBits = ScalarTy.getScalarSizeInBits();
+  LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+  LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType();
+  auto &MIB = Builder;
+  MIB.setInstrAndDebugLoc(MI);
+
+  bool UseNPQ = false;
+  SmallVector<Register, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
+
+  auto BuildUDIVPattern = [&](const Constant *C) {
+    auto *CI = cast<ConstantInt>(C);
+    const APInt &Divisor = CI->getValue();
+    UnsignedDivisonByConstantInfo magics =
+        UnsignedDivisonByConstantInfo::get(Divisor);
+    unsigned PreShift = 0, PostShift = 0;
+
+    // If the divisor is even, we can avoid using the expensive fixup by
+    // shifting the divided value upfront.
+    if (magics.IsAdd != 0 && !Divisor[0]) {
+      PreShift = Divisor.countTrailingZeros();
+      // Get magic number for the shifted divisor.
+      magics =
+          UnsignedDivisonByConstantInfo::get(Divisor.lshr(PreShift), PreShift);
+      assert(magics.IsAdd == 0 && "Should use cheap fixup now");
+    }
+
+    APInt Magic = magics.Magic;
+
+    bool SelNPQ;
+    if (magics.IsAdd == 0 || Divisor.isOneValue()) {
+      assert(magics.ShiftAmount < Divisor.getBitWidth() &&
+             "We shouldn't generate an undefined shift!");
+      PostShift = magics.ShiftAmount;
+      SelNPQ = false;
+    } else {
+      PostShift = magics.ShiftAmount - 1;
+      SelNPQ = true;
+    }
+
+    PreShifts.push_back(
+        MIB.buildConstant(ScalarShiftAmtTy, PreShift).getReg(0));
+    MagicFactors.push_back(MIB.buildConstant(ScalarTy, Magic).getReg(0));
+    NPQFactors.push_back(
+        MIB.buildConstant(ScalarTy,
+                          SelNPQ ? APInt::getOneBitSet(EltBits, EltBits - 1)
+                                 : APInt::getZero(EltBits))
+            .getReg(0));
+    PostShifts.push_back(
+        MIB.buildConstant(ScalarShiftAmtTy, PostShift).getReg(0));
+    UseNPQ |= SelNPQ;
+    return true;
+  };
+
+  // Collect the shifts/magic values from each element.
+  bool Matched = matchUnaryPredicate(MRI, RHS, BuildUDIVPattern);
+  (void)Matched;
+  assert(Matched && "Expected unary predicate match to succeed");
+
+  Register PreShift, PostShift, MagicFactor, NPQFactor;
+  auto *RHSDef = getOpcodeDef<GBuildVector>(RHS, MRI);
+  if (RHSDef) {
+    PreShift = MIB.buildBuildVector(ShiftAmtTy, PreShifts).getReg(0);
+    MagicFactor = MIB.buildBuildVector(Ty, MagicFactors).getReg(0);
+    NPQFactor = MIB.buildBuildVector(Ty, NPQFactors).getReg(0);
+    PostShift = MIB.buildBuildVector(ShiftAmtTy, PostShifts).getReg(0);
+  } else {
+    assert(MRI.getType(RHS).isScalar() &&
+           "Non-build_vector operation should have been a scalar");
+    PreShift = PreShifts[0];
+    MagicFactor = MagicFactors[0];
+    PostShift = PostShifts[0];
+  }
+
+  Register Q = LHS;
+  Q = MIB.buildLShr(Ty, Q, PreShift).getReg(0);
+
+  // Multiply the numerator (operand 0) by the magic value.
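+  // For example, x/42 (EltBits = 32) gets PreShift = 1, MagicFactor =
+  // 0x30c30c31 (818089009) and PostShift = 2, so the emitted sequence is
+  // lshr(umulh(lshr(x, 1), MagicFactor), 2); this is exactly what the
+  // udiv_by_scalar_const test in combine-udiv.mir below checks.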
+  Q = MIB.buildUMulH(Ty, Q, MagicFactor).getReg(0);
+
+  if (UseNPQ) {
+    Register NPQ = MIB.buildSub(Ty, LHS, Q).getReg(0);
+
+    // For vectors we might have a mix of non-NPQ/NPQ paths, so use
+    // G_UMULH to act as a SRL-by-1 for NPQ, else multiply by zero.
+    if (Ty.isVector())
+      NPQ = MIB.buildUMulH(Ty, NPQ, NPQFactor).getReg(0);
+    else
+      NPQ = MIB.buildLShr(Ty, NPQ, MIB.buildConstant(ShiftAmtTy, 1)).getReg(0);
+
+    Q = MIB.buildAdd(Ty, NPQ, Q).getReg(0);
+  }
+
+  Q = MIB.buildLShr(Ty, Q, PostShift).getReg(0);
+  auto One = MIB.buildConstant(Ty, 1);
+  auto IsOne = MIB.buildICmp(
+      CmpInst::Predicate::ICMP_EQ,
+      Ty.isScalar() ? LLT::scalar(1) : Ty.changeElementSize(1), RHS, One);
+  return MIB.buildSelect(Ty, IsOne, LHS, Q);
+}
+
+bool CombinerHelper::matchUDivByConst(MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::G_UDIV);
+  Register Dst = MI.getOperand(0).getReg();
+  Register RHS = MI.getOperand(2).getReg();
+  LLT DstTy = MRI.getType(Dst);
+  auto *RHSDef = MRI.getVRegDef(RHS);
+  if (!isConstantOrConstantVector(*RHSDef, MRI))
+    return false;
+
+  auto &MF = *MI.getMF();
+  AttributeList Attr = MF.getFunction().getAttributes();
+  const auto &TLI = getTargetLowering();
+  LLVMContext &Ctx = MF.getFunction().getContext();
+  auto &DL = MF.getDataLayout();
+  if (TLI.isIntDivCheap(getApproximateEVTForLLT(DstTy, DL, Ctx), Attr))
+    return false;
+
+  // Don't do this for minsize because the instruction sequence is usually
+  // larger.
+  if (MF.getFunction().hasMinSize())
+    return false;
+
+  // Don't do this if the types are not going to be legal.
+  if (LI) {
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}}))
+      return false;
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_UMULH, {DstTy}}))
+      return false;
+    if (!isLegalOrBeforeLegalizer(
+            {TargetOpcode::G_ICMP,
+             {DstTy.isVector() ? DstTy.changeElementSize(1) : LLT::scalar(1),
+              DstTy}}))
+      return false;
+  }
+
+  auto CheckEltValue = [&](const Constant *C) {
+    if (auto *CI = dyn_cast_or_null<ConstantInt>(C))
+      return !CI->isZero();
+    return false;
+  };
+  return matchUnaryPredicate(MRI, RHS, CheckEltValue);
+}
+
+void CombinerHelper::applyUDivByConst(MachineInstr &MI) {
+  auto *NewMI = buildUDivUsingMul(MI);
+  replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
+}
+
 bool CombinerHelper::tryCombine(MachineInstr &MI) {
   if (tryCombineCopy(MI))
     return true;
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -1016,6 +1016,23 @@
   return RegOrConstant(Reg);
 }
 
+bool llvm::isConstantOrConstantVector(MachineInstr &MI,
+                                      const MachineRegisterInfo &MRI) {
+  Register Def = MI.getOperand(0).getReg();
+  if (auto C = getIConstantVRegValWithLookThrough(Def, MRI))
+    return true;
+  GBuildVector *BV = dyn_cast<GBuildVector>(&MI);
+  if (!BV)
+    return false;
+  for (unsigned SrcIdx = 0; SrcIdx < BV->getNumSources(); ++SrcIdx) {
+    if (getIConstantVRegValWithLookThrough(BV->getSourceReg(SrcIdx), MRI) ||
+        getOpcodeDef<GImplicitDef>(BV->getSourceReg(SrcIdx), MRI))
+      continue;
+    return false;
+  }
+  return true;
+}
+
 Optional<APInt>
 llvm::isConstantOrConstantSplatVector(MachineInstr &MI,
                                       const MachineRegisterInfo &MRI) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -0,0 +1,287 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefixes=SDAG
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel | FileCheck %s --check-prefixes=GISEL
+
+; These tests are taken from combine-udiv.ll in the X86 tests.
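+;
+; The GISEL output below is the expansion built by buildUDivUsingMul. For a
+; uniform divisor such as 23, the quotient is computed roughly as
+; q = umulh(x, 25645); q = (q + ((x - q) >> 1)) >> 4, and the non-uniform
+; cases additionally select the original value for any lane whose divisor
+; is 1.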
+define <8 x i16> @combine_vec_udiv_uniform(<8 x i16> %x) {
+; SDAG-LABEL: combine_vec_udiv_uniform:
+; SDAG: // %bb.0:
+; SDAG-NEXT: mov w8, #25645
+; SDAG-NEXT: dup v1.8h, w8
+; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
+; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
+; SDAG-NEXT: usra v1.8h, v0.8h, #1
+; SDAG-NEXT: ushr v0.8h, v1.8h, #4
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: combine_vec_udiv_uniform:
+; GISEL: // %bb.0:
+; GISEL-NEXT: adrp x8, .LCPI0_1
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI0_1]
+; GISEL-NEXT: adrp x8, .LCPI0_0
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI0_0]
+; GISEL-NEXT: umull2 v3.4s, v0.8h, v1.8h
+; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h
+; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
+; GISEL-NEXT: umull2 v3.4s, v0.8h, v2.8h
+; GISEL-NEXT: umull v0.4s, v0.4h, v2.4h
+; GISEL-NEXT: uzp2 v0.8h, v0.8h, v3.8h
+; GISEL-NEXT: add v0.8h, v0.8h, v1.8h
+; GISEL-NEXT: ushr v0.8h, v0.8h, #4
+; GISEL-NEXT: ret
+  %1 = udiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23>
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
+; SDAG-LABEL: combine_vec_udiv_nonuniform:
+; SDAG: // %bb.0:
+; SDAG-NEXT: adrp x8, .LCPI1_0
+; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
+; SDAG-NEXT: adrp x8, .LCPI1_1
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_1]
+; SDAG-NEXT: adrp x8, .LCPI1_2
+; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI1_2]
+; SDAG-NEXT: ushl v1.8h, v0.8h, v1.8h
+; SDAG-NEXT: umull2 v4.4s, v1.8h, v2.8h
+; SDAG-NEXT: umull v1.4s, v1.4h, v2.4h
+; SDAG-NEXT: adrp x8, .LCPI1_3
+; SDAG-NEXT: uzp2 v1.8h, v1.8h, v4.8h
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_3]
+; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
+; SDAG-NEXT: umull2 v4.4s, v0.8h, v3.8h
+; SDAG-NEXT: umull v0.4s, v0.4h, v3.4h
+; SDAG-NEXT: uzp2 v0.8h, v0.8h, v4.8h
+; SDAG-NEXT: add v0.8h, v0.8h, v1.8h
+; SDAG-NEXT: ushl v0.8h, v0.8h, v2.8h
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: combine_vec_udiv_nonuniform:
+; GISEL: // %bb.0:
+; GISEL-NEXT: adrp x8, .LCPI1_5
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI1_5]
+; GISEL-NEXT: adrp x8, .LCPI1_4
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_4]
+; GISEL-NEXT: adrp x8, .LCPI1_3
+; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI1_3]
+; GISEL-NEXT: adrp x8, .LCPI1_1
+; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI1_1]
+; GISEL-NEXT: adrp x8, .LCPI1_0
+; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI1_0]
+; GISEL-NEXT: adrp x8, .LCPI1_2
+; GISEL-NEXT: neg v2.8h, v2.8h
+; GISEL-NEXT: ldr q6, [x8, :lo12:.LCPI1_2]
+; GISEL-NEXT: ushl v2.8h, v0.8h, v2.8h
+; GISEL-NEXT: cmeq v1.8h, v1.8h, v5.8h
+; GISEL-NEXT: umull2 v5.4s, v2.8h, v3.8h
+; GISEL-NEXT: umull v2.4s, v2.4h, v3.4h
+; GISEL-NEXT: uzp2 v2.8h, v2.8h, v5.8h
+; GISEL-NEXT: sub v3.8h, v0.8h, v2.8h
+; GISEL-NEXT: umull2 v5.4s, v3.8h, v6.8h
+; GISEL-NEXT: umull v3.4s, v3.4h, v6.4h
+; GISEL-NEXT: uzp2 v3.8h, v3.8h, v5.8h
+; GISEL-NEXT: neg v4.8h, v4.8h
+; GISEL-NEXT: shl v1.8h, v1.8h, #15
+; GISEL-NEXT: add v2.8h, v3.8h, v2.8h
+; GISEL-NEXT: ushl v2.8h, v2.8h, v4.8h
+; GISEL-NEXT: sshr v1.8h, v1.8h, #15
+; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
+; GISEL-NEXT: ret
+  %1 = udiv <8 x i16> %x, <i16 23, i16 34, i16 -23, i16 56, i16 128, i16 -1, i16 -256, i16 -32768>
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
+; SDAG-LABEL: combine_vec_udiv_nonuniform2:
+; SDAG: // %bb.0:
+; SDAG-NEXT: adrp x8, .LCPI2_0
+; SDAG-NEXT: adrp x9, .LCPI2_1
+; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
+; SDAG-NEXT: ldr q2, [x9, :lo12:.LCPI2_1]
+; SDAG-NEXT: adrp x8, .LCPI2_2
+; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI2_2]
+; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
+; SDAG-NEXT: umull2 v1.4s, v0.8h, v2.8h
+; SDAG-NEXT: umull v0.4s, v0.4h, v2.4h
+; SDAG-NEXT: uzp2 v0.8h, v0.8h, v1.8h
+; SDAG-NEXT: ushl v0.8h, v0.8h, v3.8h
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: combine_vec_udiv_nonuniform2:
+; GISEL: // %bb.0:
+; GISEL-NEXT: adrp x8, .LCPI2_4
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_4]
+; GISEL-NEXT: adrp x8, .LCPI2_3
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_3]
+; GISEL-NEXT: adrp x8, .LCPI2_1
+; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI2_1]
+; GISEL-NEXT: adrp x8, .LCPI2_0
+; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI2_0]
+; GISEL-NEXT: adrp x8, .LCPI2_2
+; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI2_2]
+; GISEL-NEXT: neg v2.8h, v2.8h
+; GISEL-NEXT: ushl v2.8h, v0.8h, v2.8h
+; GISEL-NEXT: cmeq v1.8h, v1.8h, v4.8h
+; GISEL-NEXT: umull2 v4.4s, v2.8h, v5.8h
+; GISEL-NEXT: umull v2.4s, v2.4h, v5.4h
+; GISEL-NEXT: neg v3.8h, v3.8h
+; GISEL-NEXT: shl v1.8h, v1.8h, #15
+; GISEL-NEXT: uzp2 v2.8h, v2.8h, v4.8h
+; GISEL-NEXT: ushl v2.8h, v2.8h, v3.8h
+; GISEL-NEXT: sshr v1.8h, v1.8h, #15
+; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
+; GISEL-NEXT: ret
+  %1 = udiv <8 x i16> %x, <i16 -34, i16 35, i16 36, i16 -37, i16 38, i16 -39, i16 40, i16 -41>
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
+; SDAG-LABEL: combine_vec_udiv_nonuniform3:
+; SDAG: // %bb.0:
+; SDAG-NEXT: adrp x8, .LCPI3_0
+; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
+; SDAG-NEXT: adrp x8, .LCPI3_1
+; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI3_1]
+; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
+; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
+; SDAG-NEXT: usra v1.8h, v0.8h, #1
+; SDAG-NEXT: ushl v0.8h, v1.8h, v3.8h
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: combine_vec_udiv_nonuniform3:
+; GISEL: // %bb.0:
+; GISEL-NEXT: adrp x8, .LCPI3_4
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_4]
+; GISEL-NEXT: adrp x8, .LCPI3_3
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI3_3]
+; GISEL-NEXT: adrp x8, .LCPI3_2
+; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI3_2]
+; GISEL-NEXT: adrp x8, .LCPI3_1
+; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI3_1]
+; GISEL-NEXT: adrp x8, .LCPI3_0
+; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI3_0]
+; GISEL-NEXT: umull2 v6.4s, v0.8h, v2.8h
+; GISEL-NEXT: umull v2.4s, v0.4h, v2.4h
+; GISEL-NEXT: uzp2 v2.8h, v2.8h, v6.8h
+; GISEL-NEXT: cmeq v1.8h, v1.8h, v5.8h
+; GISEL-NEXT: sub v5.8h, v0.8h, v2.8h
+; GISEL-NEXT: umull2 v6.4s, v5.8h, v3.8h
+; GISEL-NEXT: umull v3.4s, v5.4h, v3.4h
+; GISEL-NEXT: uzp2 v3.8h, v3.8h, v6.8h
+; GISEL-NEXT: neg v4.8h, v4.8h
+; GISEL-NEXT: shl v1.8h, v1.8h, #15
+; GISEL-NEXT: add v2.8h, v3.8h, v2.8h
+; GISEL-NEXT: ushl v2.8h, v2.8h, v4.8h
+; GISEL-NEXT: sshr v1.8h, v1.8h, #15
+; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
+; GISEL-NEXT: ret
+  %1 = udiv <8 x i16> %x, <i16 7, i16 23, i16 25, i16 27, i16 31, i16 47, i16 63, i16 127>
+  ret <8 x i16> %1
+}
+
+define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
+; SDAG-LABEL: combine_vec_udiv_nonuniform4:
+; SDAG: // %bb.0:
+; SDAG-NEXT: adrp x8, .LCPI4_0
+; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
+; SDAG-NEXT: adrp x8, .LCPI4_1
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_1]
+; SDAG-NEXT: adrp x8, .LCPI4_2
+; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI4_2]
+; SDAG-NEXT: adrp x8, .LCPI4_3
+; SDAG-NEXT: ldr q4, [x8, :lo12:.LCPI4_3]
+; SDAG-NEXT: umull2 v5.8h, v0.16b, v1.16b
+; SDAG-NEXT: umull v1.8h, v0.8b, v1.8b
+; SDAG-NEXT: uzp2 v1.16b, v1.16b, v5.16b
+; SDAG-NEXT: ushl v1.16b, v1.16b, v2.16b
+; SDAG-NEXT: and v1.16b, v1.16b, v3.16b
+; SDAG-NEXT: and v0.16b, v0.16b, v4.16b
+; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: combine_vec_udiv_nonuniform4:
+; GISEL: // %bb.0:
+; GISEL-NEXT: adrp x8, .LCPI4_3
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI4_3]
+; GISEL-NEXT: adrp x8, .LCPI4_0
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI4_0]
+; GISEL-NEXT: adrp x8, .LCPI4_2
+; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI4_2]
+; GISEL-NEXT: adrp x8, .LCPI4_1
+; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI4_1]
+; GISEL-NEXT: cmeq v1.16b, v1.16b, v2.16b
+; GISEL-NEXT: umull2 v2.8h, v0.16b, v3.16b
+; GISEL-NEXT: umull v3.8h, v0.8b, v3.8b
+; GISEL-NEXT: neg v4.16b, v4.16b
+; GISEL-NEXT: uzp2 v2.16b, v3.16b, v2.16b
+; GISEL-NEXT: shl v1.16b, v1.16b, #7
+; GISEL-NEXT: ushl v2.16b, v2.16b, v4.16b
+; GISEL-NEXT: sshr v1.16b, v1.16b, #7
+; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
+; GISEL-NEXT: ret
+  %div = udiv <16 x i8> %x, <i8 -64, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %div
+}
+
+define <8 x i16> @pr38477(<8 x i16> %a0) {
+; SDAG-LABEL: pr38477:
+; SDAG: // %bb.0:
+; SDAG-NEXT: adrp x8, .LCPI5_0
+; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
+; SDAG-NEXT: adrp x8, .LCPI5_1
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_1]
+; SDAG-NEXT: adrp x8, .LCPI5_2
+; SDAG-NEXT: umull2 v4.4s, v0.8h, v1.8h
+; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
+; SDAG-NEXT: uzp2 v1.8h, v1.8h, v4.8h
+; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI5_2]
+; SDAG-NEXT: adrp x8, .LCPI5_3
+; SDAG-NEXT: sub v4.8h, v0.8h, v1.8h
+; SDAG-NEXT: umull2 v5.4s, v4.8h, v2.8h
+; SDAG-NEXT: umull v2.4s, v4.4h, v2.4h
+; SDAG-NEXT: ldr q4, [x8, :lo12:.LCPI5_3]
+; SDAG-NEXT: adrp x8, .LCPI5_4
+; SDAG-NEXT: uzp2 v2.8h, v2.8h, v5.8h
+; SDAG-NEXT: ldr q5, [x8, :lo12:.LCPI5_4]
+; SDAG-NEXT: add v1.8h, v2.8h, v1.8h
+; SDAG-NEXT: ushl v1.8h, v1.8h, v3.8h
+; SDAG-NEXT: and v1.16b, v1.16b, v4.16b
+; SDAG-NEXT: and v0.16b, v0.16b, v5.16b
+; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: pr38477:
+; GISEL: // %bb.0:
+; GISEL-NEXT: adrp x8, .LCPI5_4
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI5_4]
+; GISEL-NEXT: adrp x8, .LCPI5_3
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI5_3]
+; GISEL-NEXT: adrp x8, .LCPI5_2
+; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI5_2]
+; GISEL-NEXT: adrp x8, .LCPI5_1
+; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI5_1]
+; GISEL-NEXT: adrp x8, .LCPI5_0
+; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI5_0]
+; GISEL-NEXT: umull2 v6.4s, v0.8h, v2.8h
+; GISEL-NEXT: umull v2.4s, v0.4h, v2.4h
+; GISEL-NEXT: uzp2 v2.8h, v2.8h, v6.8h
+; GISEL-NEXT: cmeq v1.8h, v1.8h, v5.8h
+; GISEL-NEXT: sub v5.8h, v0.8h, v2.8h
+; GISEL-NEXT: umull2 v6.4s, v5.8h, v3.8h
+; GISEL-NEXT: umull v3.4s, v5.4h, v3.4h
+; GISEL-NEXT: uzp2 v3.8h, v3.8h, v6.8h
+; GISEL-NEXT: neg v4.8h, v4.8h
+; GISEL-NEXT: shl v1.8h, v1.8h, #15
+; GISEL-NEXT: add v2.8h, v3.8h, v2.8h
+; GISEL-NEXT: ushl v2.8h, v2.8h, v4.8h
+; GISEL-NEXT: sshr v1.8h, v1.8h, #15
+; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
+; GISEL-NEXT: ret
+  %1 = udiv <8 x i16> %a0, <i16 1, i16 119, i16 73, i16 -111, i16 -3, i16 118, i16 32, i16 31>
+  ret <8 x i16> %1
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
@@ -0,0 +1,353 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s
+---
+name: udiv_by_scalar_const
+body: |
+  bb.1:
+    liveins: $w0
+    ; CHECK-LABEL: name: udiv_by_scalar_const
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 818089009
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[LSHR]], [[C1]]
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C2]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[LSHR1]](s32)
+    %0:_(s32) = COPY $w0
+    %cst:_(s32) = G_CONSTANT i32 42
+    %2:_(s32) = G_UDIV %0(s32), %cst(s32)
+    $w0 = COPY %2(s32)
+...
+---
+name: combine_vec_udiv_uniform
+alignment: 4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$q0' }
+body: |
+  bb.1:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: combine_vec_udiv_uniform
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16)
+    ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
+    ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR2]](<8 x s16>)
+    ; CHECK-NEXT: $q0 = COPY [[LSHR]](<8 x s16>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<8 x s16>) = COPY $q0
+    %2:_(s16) = G_CONSTANT i16 23
+    %1:_(<8 x s16>) = G_BUILD_VECTOR %2(s16), %2(s16), %2(s16), %2(s16), %2(s16), %2(s16), %2(s16), %2(s16)
+    %3:_(<8 x s16>) = G_UDIV %0, %1
+    $q0 = COPY %3(<8 x s16>)
+    RET_ReallyLR implicit $q0
+
+...
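+# The vector cases below cannot use a plain G_LSHR-by-1 for the "NPQ" fixup
+# because only some lanes may need it, so a G_UMULH by 0x8000 (the
+# G_CONSTANT i16 -32768 above) acts as the shift-right-by-1, and lanes that
+# do not need the fixup multiply by 0 instead.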
+---
+name: combine_vec_udiv_nonuniform
+alignment: 4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$q0' }
+body: |
+  bb.1:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: combine_vec_udiv_nonuniform
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 23
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 34
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -23
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 56
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 128
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -256
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C3]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16)
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
+    ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 -3855
+    ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 5
+    ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 8195
+    ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 13
+    ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 3
+    ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363
+    ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s16) = G_CONSTANT i16 512
+    ; CHECK-NEXT: [[C18:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32767
+    ; CHECK-NEXT: [[C19:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32639
+    ; CHECK-NEXT: [[C21:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C15]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C9]](s16), [[C11]](s16), [[C13]](s16), [[C16]](s16), [[C17]](s16), [[C18]](s16), [[C20]](s16), [[C21]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C7]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C10]](s16), [[C12]](s16), [[C14]](s16), [[C8]](s16), [[C8]](s16), [[C19]](s16), [[C19]](s16), [[C8]](s16)
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[COPY]], [[BUILD_VECTOR1]](<8 x s16>)
+    ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[LSHR]], [[BUILD_VECTOR2]]
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
+    ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR3]]
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]]
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR4]](<8 x s16>)
+    ; CHECK-NEXT: [[C22:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16)
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR5]]
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[ICMP]](<8 x s1>), [[COPY]], [[LSHR1]]
+    ; CHECK-NEXT: $q0 = COPY [[SELECT]](<8 x s16>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<8 x s16>) = COPY $q0
+    %2:_(s16) = G_CONSTANT i16 23
+    %3:_(s16) = G_CONSTANT i16 34
+    %4:_(s16) = G_CONSTANT i16 -23
+    %5:_(s16) = G_CONSTANT i16 56
+    %6:_(s16) = G_CONSTANT i16 128
+    %7:_(s16) = G_CONSTANT i16 -1
+    %8:_(s16) = G_CONSTANT i16 -256
+    %9:_(s16) = G_CONSTANT i16 -32768
+    %1:_(<8 x s16>) = G_BUILD_VECTOR %2(s16), %3(s16), %4(s16), %5(s16), %6(s16), %7(s16), %8(s16), %9(s16)
+    %10:_(<8 x s16>) = G_UDIV %0, %1
+    $q0 = COPY %10(<8 x s16>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: combine_vec_udiv_nonuniform2
+alignment: 4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$q0' }
+body: |
+  bb.1:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: combine_vec_udiv_nonuniform2
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -34
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 35
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 36
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -37
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 38
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 -39
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 40
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 -41
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C3]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16)
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 16393
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
+    ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 13
+    ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 -5617
+    ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 5
+    ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 -7281
+    ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32749
+    ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s16) = G_CONSTANT i16 -10347
+    ; CHECK-NEXT: [[C18:%[0-9]+]]:_(s16) = G_CONSTANT i16 8197
+    ; CHECK-NEXT: [[C19:%[0-9]+]]:_(s16) = G_CONSTANT i16 -13107
+    ; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32747
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C10]](s16), [[C10]](s16), [[C10]](s16), [[C10]](s16), [[C10]](s16), [[C10]](s16), [[C10]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C9]](s16), [[C12]](s16), [[C14]](s16), [[C15]](s16), [[C17]](s16), [[C18]](s16), [[C19]](s16), [[C20]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C11]](s16), [[C13]](s16), [[C13]](s16), [[C16]](s16), [[C13]](s16), [[C11]](s16), [[C13]](s16), [[C16]](s16)
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[COPY]], [[BUILD_VECTOR1]](<8 x s16>)
+    ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[LSHR]], [[BUILD_VECTOR2]]
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[UMULH]], [[BUILD_VECTOR3]](<8 x s16>)
+    ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16)
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR4]]
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[ICMP]](<8 x s1>), [[COPY]], [[LSHR1]]
+    ; CHECK-NEXT: $q0 = COPY [[SELECT]](<8 x s16>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<8 x s16>) = COPY $q0
+    %2:_(s16) = G_CONSTANT i16 -34
+    %3:_(s16) = G_CONSTANT i16 35
+    %4:_(s16) = G_CONSTANT i16 36
+    %5:_(s16) = G_CONSTANT i16 -37
+    %6:_(s16) = G_CONSTANT i16 38
+    %7:_(s16) = G_CONSTANT i16 -39
+    %8:_(s16) = G_CONSTANT i16 40
+    %9:_(s16) = G_CONSTANT i16 -41
+    %1:_(<8 x s16>) = G_BUILD_VECTOR %2(s16), %3(s16), %4(s16), %5(s16), %6(s16), %7(s16), %8(s16), %9(s16)
+    %10:_(<8 x s16>) = G_UDIV %0, %1
+    $q0 = COPY %10(<8 x s16>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: combine_vec_udiv_nonuniform3
+alignment: 4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$q0' }
+body: |
+  bb.1:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: combine_vec_udiv_nonuniform3
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 7
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 23
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 25
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 27
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 31
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 47
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 63
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 127
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C3]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16)
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
+    ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645
+    ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
+    ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 18351
+    ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 12137
+    ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115
+    ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 23705
+    ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s16) = G_CONSTANT i16 5
+    ; CHECK-NEXT: [[C18:%[0-9]+]]:_(s16) = G_CONSTANT i16 1041
+    ; CHECK-NEXT: [[C19:%[0-9]+]]:_(s16) = G_CONSTANT i16 517
+    ; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 6
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C11]](s16), [[C13]](s16), [[C14]](s16), [[C15]](s16), [[C16]](s16), [[C18]](s16), [[C19]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C9]](s16), [[C9]](s16), [[C9]](s16), [[C9]](s16), [[C9]](s16), [[C9]](s16), [[C9]](s16), [[C9]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C10]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C17]](s16), [[C17]](s16), [[C20]](s16)
+    ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
+    ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR2]]
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR3]](<8 x s16>)
+    ; CHECK-NEXT: [[C21:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16)
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR4]]
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[ICMP]](<8 x s1>), [[COPY]], [[LSHR]]
+    ; CHECK-NEXT: $q0 = COPY [[SELECT]](<8 x s16>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<8 x s16>) = COPY $q0
+    %2:_(s16) = G_CONSTANT i16 7
+    %3:_(s16) = G_CONSTANT i16 23
+    %4:_(s16) = G_CONSTANT i16 25
+    %5:_(s16) = G_CONSTANT i16 27
+    %6:_(s16) = G_CONSTANT i16 31
+    %7:_(s16) = G_CONSTANT i16 47
+    %8:_(s16) = G_CONSTANT i16 63
+    %9:_(s16) = G_CONSTANT i16 127
+    %1:_(<8 x s16>) = G_BUILD_VECTOR %2(s16), %3(s16), %4(s16), %5(s16), %6(s16), %7(s16), %8(s16), %9(s16)
+    %10:_(<8 x s16>) = G_UDIV %0, %1
+    $q0 = COPY %10(<8 x s16>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: combine_vec_udiv_nonuniform4
+alignment: 4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$q0' }
+body: |
+  bb.1:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: combine_vec_udiv_nonuniform4
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 -64
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s8) = G_CONSTANT i8 -85
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s8) = G_CONSTANT i8 7
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C3]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8)
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C4]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8)
+    ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<16 x s8>) = G_UMULH [[COPY]], [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<16 x s8>) = G_LSHR [[UMULH]], [[BUILD_VECTOR2]](<16 x s8>)
+    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8)
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<16 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<16 x s8>), [[BUILD_VECTOR3]]
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<16 x s8>) = G_SELECT [[ICMP]](<16 x s1>), [[COPY]], [[LSHR]]
+    ; CHECK-NEXT: $q0 = COPY [[SELECT]](<16 x s8>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<16 x s8>) = COPY $q0
+    %2:_(s8) = G_CONSTANT i8 -64
+    %3:_(s8) = G_CONSTANT i8 1
+    %1:_(<16 x s8>) = G_BUILD_VECTOR %2(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8)
+    %4:_(<16 x s8>) = G_UDIV %0, %1
+    $q0 = COPY %4(<16 x s8>)
+    RET_ReallyLR implicit $q0
+
+...
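+# buildUDivUsingMul always guards the result with a trailing G_ICMP eq 1 /
+# G_SELECT: a divisor of 1 has no valid magic multiply, so those lanes fall
+# back to the original numerator, as the selects above and below check.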
+---
+name: pr38477
+alignment: 4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$q0' }
+body: |
+  bb.1:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: pr38477
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 119
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 73
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -111
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -3
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 118
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 32
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 31
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C3]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16)
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 4957
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
+    ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 6
+    ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 -8079
+    ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 4103
+    ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 12
+    ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 16385
+    ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 14
+    ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s16) = G_CONSTANT i16 -29991
+    ; CHECK-NEXT: [[C18:%[0-9]+]]:_(s16) = G_CONSTANT i16 2048
+    ; CHECK-NEXT: [[C19:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115
+    ; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C9]](s16), [[C12]](s16), [[C13]](s16), [[C15]](s16), [[C17]](s16), [[C18]](s16), [[C19]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C10]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C10]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C11]](s16), [[C11]](s16), [[C14]](s16), [[C16]](s16), [[C11]](s16), [[C8]](s16), [[C20]](s16)
+    ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
+    ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR2]]
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR3]](<8 x s16>)
+    ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR4]]
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[ICMP]](<8 x s1>), [[COPY]], [[LSHR]]
+    ; CHECK-NEXT: $q0 = COPY [[SELECT]](<8 x s16>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<8 x s16>) = COPY $q0
+    %2:_(s16) = G_CONSTANT i16 1
+    %3:_(s16) = G_CONSTANT i16 119
+    %4:_(s16) = G_CONSTANT i16 73
+    %5:_(s16) = G_CONSTANT i16 -111
+    %6:_(s16) = G_CONSTANT i16 -3
+    %7:_(s16) = G_CONSTANT i16 118
+    %8:_(s16) = G_CONSTANT i16 32
+    %9:_(s16) = G_CONSTANT i16 31
+    %1:_(<8 x s16>) = G_BUILD_VECTOR %2(s16), %3(s16), %4(s16), %5(s16), %6(s16), %7(s16), %8(s16), %9(s16)
+    %10:_(<8 x s16>) = G_UDIV %0, %1
+    $q0 = COPY %10(<8 x s16>)
+    RET_ReallyLR implicit $q0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
@@ -222,117 +222,21 @@
 ; CHECK-LABEL: v_udiv_i32_pow2k_denom:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_movk_i32 s6, 0x1000
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, 0x45800000
-; CHECK-NEXT: v_mov_b32_e32 v2, 0xfffff000
-; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1
-; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v1
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; CHECK-NEXT: v_subrev_i32_e64 v2, s[4:5], s6, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v1
-; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x100000
+; CHECK-NEXT: v_mul_hi_u32 v0, v0, v1
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
   %result = udiv i32 %num, 4096
   ret i32 %result
 }
 
 define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
-; GISEL-LABEL: v_udiv_v2i32_pow2k_denom:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_movk_i32 s4, 0x1000
-; GISEL-NEXT: v_mov_b32_e32 v2, 0x1000
-; GISEL-NEXT: v_mov_b32_e32 v3, 0xfffff000
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s4
-; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v2
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT: v_mul_lo_u32 v6, v3, v4
-; GISEL-NEXT: v_mul_lo_u32 v3, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6
-; GISEL-NEXT: v_mul_hi_u32 v3, v5, v3
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
-; GISEL-NEXT: v_mul_hi_u32 v3, v1, v3
-; GISEL-NEXT: v_lshlrev_b32_e32 v5, 12, v4
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; GISEL-NEXT: v_lshlrev_b32_e32 v7, 12, v3
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v3
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
-; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GISEL-NEXT: v_subrev_i32_e64 v5, s[4:5], s4, v0
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v6, s[6:7], v1, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v3
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; CGP-LABEL: v_udiv_v2i32_pow2k_denom:
-; CGP: ; %bb.0:
-; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: s_movk_i32 s8, 0x1000
-; CGP-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000
-; CGP-NEXT: s_movk_i32 s4, 0xf000
-; CGP-NEXT: v_mov_b32_e32 v3, 0xfffff000
-; CGP-NEXT: v_mov_b32_e32 v4, 0x1000
-; CGP-NEXT: v_rcp_iflag_f32_e32 v5, 0x45800000
-; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT: v_mul_lo_u32 v6, s4, v2
-; CGP-NEXT: v_mul_lo_u32 v3, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v2, v6
-; CGP-NEXT: v_mul_hi_u32 v3, v5, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v1, v3
-; CGP-NEXT: v_lshlrev_b32_e32 v5, 12, v2
-; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v2
-; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v3
-; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
-; CGP-NEXT: v_subrev_i32_e64 v5, s[4:5], s8, v0
-; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v1
-; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5]
-; CGP-NEXT: v_sub_i32_e64 v6, s[6:7], v1, v4
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
-; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v3
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
-; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
-; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
-; CGP-NEXT: s_setpc_b64 s[30:31]
+; CHECK-LABEL: v_udiv_v2i32_pow2k_denom:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0x100000
+; CHECK-NEXT: v_mul_hi_u32 v0, v0, s4
+; CHECK-NEXT: v_mul_hi_u32 v1, v1, s4
+; CHECK-NEXT: s_setpc_b64 s[30:31]
   %result = udiv <2 x i32> %num, <i32 4096, i32 4096>
   ret <2 x i32> %result
 }
@@ -341,25 +245,12 @@
 ; CHECK-LABEL: v_udiv_i32_oddk_denom:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, 0x4996c7d8
-; CHECK-NEXT: v_mov_b32_e32 v2, 0xffed2705
-; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1
-; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_mov_b32_e32 v1, 0xb2a50881
 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT: v_mul_lo_u32 v2, v1, s6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; CHECK-NEXT: v_subrev_i32_e64 v2, s[4:5], s6, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v1
-; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v0, 20, v0
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
   %result = udiv i32 %num, 1235195
   ret i32 %result
@@ -369,87 +260,34 @@
 ; GISEL-LABEL: v_udiv_v2i32_oddk_denom:
 ; GISEL: ; %bb.0:
 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb
-; GISEL-NEXT: v_mov_b32_e32 v2, 0x12d8fb
-; GISEL-NEXT: v_mov_b32_e32 v3, 0xffed2705
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s8
-; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v2
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT: v_mul_lo_u32 v6, v3, v4
-; GISEL-NEXT: v_mul_lo_u32 v3, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6
-; GISEL-NEXT: v_mul_hi_u32 v3, v5, v3
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
-; GISEL-NEXT: v_mul_hi_u32 v3, v1, v3
-; GISEL-NEXT: v_mul_lo_u32 v5, v4, s8
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v3, v2
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v3
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
-; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GISEL-NEXT: v_subrev_i32_e64 v5, s[4:5], s8, v0
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v6, s[6:7], v1, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v3
-; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GISEL-NEXT: s_mov_b32 s4, 0xb2a50881
+; GISEL-NEXT: s_brev_b32 s5, 1
+; GISEL-NEXT: v_mul_hi_u32 v2, v0, s4
+; GISEL-NEXT: v_mul_hi_u32 v3, v1, s4
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT: v_mul_hi_u32 v0, v0, s5
+; GISEL-NEXT: v_mul_hi_u32 v1, v1, s5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v1, 20, v1
 ; GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_v2i32_oddk_denom:
 ; CGP: ; %bb.0:
 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: s_mov_b32 s8, 0x12d8fb
-; CGP-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8
-; CGP-NEXT: s_mov_b32 s4, 0xffed2705
-; CGP-NEXT: v_mov_b32_e32 v3, 0x12d8fb
-; CGP-NEXT: v_rcp_iflag_f32_e32 v4, 0x4996c7d8
-; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_mul_lo_u32 v5, s4, v2
-; CGP-NEXT: v_mul_lo_u32 v6, s4, v4
-; CGP-NEXT: v_mul_hi_u32 v5, v2, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v4, v6
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT: v_mul_hi_u32 v4, v1, v4
-; CGP-NEXT: v_mul_lo_u32 v5, v2, s8
-; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v2
-; CGP-NEXT: v_mul_lo_u32 v7, v4, s8
-; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
-; CGP-NEXT: v_subrev_i32_e64 v5, s[4:5], s8, v0
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
-; CGP-NEXT: v_sub_i32_e64 v6, s[6:7], v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
-; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
-; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc
+; CGP-NEXT: s_mov_b32 s4, 0xb2a50881
+; CGP-NEXT: v_mul_hi_u32 v2, v0, s4
+; CGP-NEXT: v_mul_hi_u32 v3, v1, s4
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
+; CGP-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; CGP-NEXT: v_lshrrev_b32_e32 v1, 1, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CGP-NEXT: v_lshrrev_b32_e32 v0, 20, v0
+; CGP-NEXT: v_lshrrev_b32_e32 v1, 20, v1
 ; CGP-NEXT: s_setpc_b64 s[30:31]
   %result = udiv <2 x i32> %num, <i32 1235195, i32 1235195>
   ret <2 x i32> %result
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -969,659 +969,78 @@
 ; CHECK-LABEL: v_udiv_i64_pow2k_denom:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000
-; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: s_movk_i32 s6, 0xf000
-; CHECK-NEXT: s_movk_i32 s7, 0x1000
-; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000
-; CHECK-NEXT: s_bfe_i32 s5, -1, 0x10000
-; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
-; CHECK-NEXT: v_mov_b32_e32 v3, s4
-; CHECK-NEXT: v_mov_b32_e32 v4, s5
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2
-; CHECK-NEXT: v_trunc_f32_e32 v5, v5
-; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5
-; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5
-; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT: v_mul_lo_u32 v6, s6, v5
-; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, -1, v2
-; CHECK-NEXT: v_mul_hi_u32 v9, s6, v2
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v7
-; CHECK-NEXT: v_mul_hi_u32 v10, v2, v7
-; CHECK-NEXT: v_mul_hi_u32 v7, v5, v7
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; CHECK-NEXT: v_mul_lo_u32 v9, v2, v6
-; CHECK-NEXT: v_mul_lo_u32 v11, v5, v6
-; CHECK-NEXT: v_mul_hi_u32 v12, v2, v6
-; CHECK-NEXT: v_mul_hi_u32 v6, v5, v6
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v11, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v10
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT: v_addc_u32_e64 v7, s[4:5], v5, v6, vcc
-; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
-; CHECK-NEXT: v_mul_lo_u32 v6, s6, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, -1, v2
-; CHECK-NEXT: v_mul_hi_u32 v9, s6, v2
-; CHECK-NEXT: v_mul_lo_u32 v10, s6, v7
-; CHECK-NEXT: v_mul_lo_u32 v11, v7, v6
-; CHECK-NEXT: v_mul_hi_u32 v12, v2, v6
-; CHECK-NEXT: v_mul_hi_u32 v6, v7, v6
-; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10
-; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9
-; CHECK-NEXT: v_mul_lo_u32 v9, v2, v8
-; CHECK-NEXT: v_mul_lo_u32 v10, v7, v8
-; CHECK-NEXT: v_mul_hi_u32 v13, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v7, v7, v8
-; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v11, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v10, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8
-; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v11
-; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8
-; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8
-; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CHECK-NEXT: v_mul_lo_u32 v6, v1, v2
-; CHECK-NEXT: v_mul_hi_u32 v7, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v0, v5
-; CHECK-NEXT: v_mul_lo_u32 v9, v1, v5
-; CHECK-NEXT: v_mul_hi_u32 v10, v0, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT: s_mov_b32 s4, 0x100000
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, v0, 0
+; CHECK-NEXT: v_lshlrev_b32_e32 v4, 20, v1
+; CHECK-NEXT: v_mul_hi_u32 v5, v1, 0
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0, v2
 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_mul_lo_u32 v7, s7, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, 0, v2
-; CHECK-NEXT: v_mul_hi_u32 v9, s7, v2
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT: v_mul_lo_u32 v6, s7, v5
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, 1, v2
-; CHECK-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v10
-; CHECK-NEXT: v_addc_u32_e32 v12, vcc, 0, v11, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT: v_subb_u32_e64 v7, s[4:5], v1, v6, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6
-; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5]
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s7, v0
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s7, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT: v_mul_hi_u32 v0, v0, s4
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v1, v1, s4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %num, 4096 ret i64 %result } define <2 x i64> @v_udiv_v2i64_pow2k_denom(<2 x i64> %num) { -; GISEL-LABEL: v_udiv_v2i64_pow2k_denom: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s12, 0x1000 -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s12 -; GISEL-NEXT: s_sub_u32 s8, 0, s12 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: v_mov_b32_e32 v6, v4 -; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v6 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s9, 0, 0 -; GISEL-NEXT: s_bfe_i32 s10, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 -; GISEL-NEXT: s_sub_u32 s13, 0, s12 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v6, v6 -; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_trunc_f32_e32 v7, v7 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_mul_lo_u32 v8, s13, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, s8, v7 -; GISEL-NEXT: v_mul_lo_u32 v10, s13, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, s6, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, s13, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v14, s9, v5 -; GISEL-NEXT: v_mul_hi_u32 v15, s8, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v17, v5, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v18, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v6, v8 -; GISEL-NEXT: v_mul_lo_u32 v19, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v16 -; GISEL-NEXT: 
v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v16 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v18 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v15 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, s13, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, s6, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, s13, v4 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 -; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v15, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v16, s9, v5 -; GISEL-NEXT: v_mul_hi_u32 v17, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v18, s8, v13 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v15 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v5, v15 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v16 -; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], v19, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], v17, v18 -; GISEL-NEXT: v_mul_lo_u32 v17, s13, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v10, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[8:9], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v17, v4, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[8:9], v12, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 -; GISEL-NEXT: v_add_i32_e64 v14, s[8:9], v18, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v14, s[8:9], v14, v17 -; GISEL-NEXT: v_mov_b32_e32 v14, s10 -; GISEL-NEXT: v_mov_b32_e32 v17, s11 -; GISEL-NEXT: s_bfe_i32 s13, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s14, -1, 0x10000 -; GISEL-NEXT: v_add_i32_e64 v6, s[10:11], v6, v8 -; GISEL-NEXT: v_mov_b32_e32 v8, s13 -; GISEL-NEXT: v_add_i32_e64 v7, s[10:11], v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[8:9], v18, v15 -; GISEL-NEXT: v_mul_lo_u32 v18, v10, v12 -; GISEL-NEXT: v_mul_hi_u32 v10, v10, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v18, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[8:9], v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v13, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v16, v5, v16 -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v19, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v19, v16 -; GISEL-NEXT: v_mov_b32_e32 v19, s14 -; GISEL-NEXT: v_add_i32_e64 
v9, s[6:7], v9, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v12, v15 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v18 -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v12 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v13, v15 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc -; GISEL-NEXT: v_addc_u32_e64 v7, vcc, v7, v12, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v5 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 -; GISEL-NEXT: v_mul_lo_u32 v13, v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 -; GISEL-NEXT: v_mul_lo_u32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v15, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, s12, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, s12, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_lo_u32 v12, s12, v5 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v18, s12, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_mul_lo_u32 v9, s12, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, s12, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v4 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v6, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v3, v9, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v9 -; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s12, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: 
v_sub_i32_e64 v0, s[4:5], v0, v12 -; GISEL-NEXT: v_subb_u32_e64 v12, s[6:7], v1, v11, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v11 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], 1, v13 -; GISEL-NEXT: v_addc_u32_e64 v11, s[8:9], 0, v16, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[6:7] -; GISEL-NEXT: v_cmp_le_u32_e64 s[6:7], s12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v14, v9, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], 1, v15 -; GISEL-NEXT: v_addc_u32_e64 v14, s[6:7], 0, v18, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] -; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, s12, v2 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v13, v10, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, v15, v12, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v11, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v1, v18, v14, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GISEL-NEXT: s_setpc_b64 s[30:31] -; -; CGP-LABEL: v_udiv_v2i64_pow2k_denom: -; CGP: ; %bb.0: -; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_movk_i32 s8, 0xf000 -; CGP-NEXT: s_movk_i32 s12, 0x1000 -; CGP-NEXT: s_bfe_i32 s10, -1, 0x10000 -; CGP-NEXT: s_bfe_i32 s11, -1, 0x10000 -; CGP-NEXT: s_bfe_i32 s13, -1, 0x10000 -; CGP-NEXT: s_bfe_i32 s14, -1, 0x10000 -; CGP-NEXT: v_mov_b32_e32 v6, v4 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v6 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v6, v6 -; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_mul_lo_u32 v8, s8, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s8, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v11, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v12, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v13, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v14, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v15, s8, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v16, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, 
v6, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v8 -; CGP-NEXT: v_mul_hi_u32 v18, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v19, v5, v9 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v16 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v18 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v16 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v19, v18 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v15 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v11, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v12, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v14, s8, v4 -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 -; CGP-NEXT: v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v15, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v16, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v17, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v18, s8, v13 -; CGP-NEXT: v_mul_lo_u32 v19, v13, v15 -; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v16, v18 -; CGP-NEXT: v_mul_hi_u32 v18, v5, v15 -; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v16, v17 -; CGP-NEXT: v_mul_lo_u32 v17, v5, v16 -; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v19, v17 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v17, v18 -; CGP-NEXT: v_mul_lo_u32 v17, s8, v10 -; CGP-NEXT: v_mul_lo_u32 v18, v10, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[8:9], v12, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v4, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[8:9], v12, v14 -; CGP-NEXT: v_mul_lo_u32 v14, v4, v12 -; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v18, v14 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v14, v17 -; CGP-NEXT: v_mov_b32_e32 v14, s10 -; CGP-NEXT: v_mov_b32_e32 v17, s11 -; CGP-NEXT: v_add_i32_e64 v6, s[10:11], v6, v8 -; CGP-NEXT: v_mov_b32_e32 v8, s13 -; CGP-NEXT: v_add_i32_e64 v7, s[10:11], v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v10, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v15, s[8:9], v18, v15 -; CGP-NEXT: v_mul_lo_u32 v18, v10, v12 -; CGP-NEXT: v_mul_hi_u32 
v10, v10, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v12 -; CGP-NEXT: v_add_i32_e64 v9, s[8:9], v18, v9 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v9, s[8:9], v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v12, s[8:9], v18, v12 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 -; CGP-NEXT: v_mul_lo_u32 v19, v13, v16 -; CGP-NEXT: v_mul_hi_u32 v13, v13, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v5, v16 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v19, v11 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v19, v16 -; CGP-NEXT: v_mov_b32_e32 v19, s14 -; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v12, v15 -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v16, v18 -; CGP-NEXT: v_add_i32_e64 v10, s[6:7], v10, v12 -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v13, v15 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc -; CGP-NEXT: v_addc_u32_e64 v7, vcc, v7, v12, s[4:5] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v1, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v0, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_mul_lo_u32 v13, v2, v6 -; CGP-NEXT: v_mul_lo_u32 v15, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v16, v2, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v3, v6 -; CGP-NEXT: v_mul_lo_u32 v18, v0, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v11, v1, v7 -; CGP-NEXT: v_mul_hi_u32 v12, v0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v15, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v16 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v18, v16 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_lo_u32 v10, s12, v4 -; CGP-NEXT: v_mul_lo_u32 v13, 0, v4 -; CGP-NEXT: v_mul_hi_u32 v15, s12, v4 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v12, s12, v5 -; CGP-NEXT: v_mul_lo_u32 v16, 0, v5 -; CGP-NEXT: v_mul_hi_u32 v18, s12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v9, s12, v6 
-; CGP-NEXT: v_mul_lo_u32 v11, s12, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v4 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v6, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v5 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v7, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v3, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v9 -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s12, v2 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v12 -; CGP-NEXT: v_subb_u32_e64 v12, s[6:7], v1, v11, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v11 -; CGP-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; CGP-NEXT: v_add_i32_e64 v10, s[8:9], 1, v13 -; CGP-NEXT: v_addc_u32_e64 v11, s[8:9], 0, v16, s[8:9] -; CGP-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[6:7] -; CGP-NEXT: v_cmp_le_u32_e64 s[6:7], s12, v0 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; CGP-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v12 -; CGP-NEXT: v_cndmask_b32_e64 v9, v14, v9, s[6:7] -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], 1, v15 -; CGP-NEXT: v_addc_u32_e64 v14, s[6:7], 0, v18, s[6:7] -; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] -; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s12, v2 -; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CGP-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 -; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v10, vcc -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; CGP-NEXT: v_cndmask_b32_e64 v0, v15, v12, s[4:5] -; CGP-NEXT: v_cndmask_b32_e32 v3, v16, v11, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v1, v18, v14, s[4:5] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 -; CGP-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[4:5] -; CGP-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5] -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: s_setpc_b64 s[30:31] +; CHECK-LABEL: v_udiv_v2i64_pow2k_denom: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0x100000 +; CHECK-NEXT: v_lshlrev_b32_e32 v4, 20, v0 +; CHECK-NEXT: v_mul_hi_u32 v5, v0, 0 +; CHECK-NEXT: v_lshlrev_b32_e32 v6, 20, v1 +; CHECK-NEXT: v_mul_hi_u32 v7, v1, 0 +; CHECK-NEXT: v_lshlrev_b32_e32 v8, 20, v2 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, 0 +; CHECK-NEXT: v_lshlrev_b32_e32 v10, 20, v3 +; CHECK-NEXT: v_mul_hi_u32 v11, v3, 0 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v0, v0, s4 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v1, v1, s4 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, 0, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v2, v2, s4 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v3, v3, s4 
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v11, v8 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: s_setpc_b64 s[30:31] %result = udiv <2 x i64> %num, <i64 4096, i64 4096> ret <2 x i64> %result } @@ -1630,659 +1049,86 @@ ; CHECK-LABEL: v_udiv_i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: s_mov_b32 s6, 0xffed2705 -; CHECK-NEXT: s_mov_b32 s7, 0x12d8fb -; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CHECK-NEXT: s_bfe_i32 s5, -1, 0x10000 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CHECK-NEXT: v_mov_b32_e32 v3, s4 -; CHECK-NEXT: v_mov_b32_e32 v4, s5 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v5, v5 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v5 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v9, s6, v2 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v7 -; CHECK-NEXT: v_mul_hi_u32 v10, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CHECK-NEXT: v_mul_lo_u32 v9, v2, v6 -;
CHECK-NEXT: v_mul_lo_u32 v10, v7, v8 -; CHECK-NEXT: v_mul_hi_u32 v13, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v7, v7, v8 -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v11, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v10, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v11 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v0, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v1, v5 -; CHECK-NEXT: v_mul_hi_u32 v10, v0, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_lo_u32 v7, s7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, 0, v2 -; CHECK-NEXT: v_mul_hi_u32 v9, s7, v2 +; CHECK-NEXT: s_mov_b32 s4, 0x1fb03c31 +; CHECK-NEXT: s_mov_b32 s5, 0xd9528440 +; CHECK-NEXT: v_mul_lo_u32 v2, v1, s4 +; CHECK-NEXT: v_mul_lo_u32 v3, v0, s5 +; CHECK-NEXT: v_mul_hi_u32 v4, v0, s4 +; CHECK-NEXT: v_mul_lo_u32 v5, v1, s5 +; CHECK-NEXT: v_mul_hi_u32 v6, v1, s4 +; CHECK-NEXT: v_mul_hi_u32 v0, v0, s5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_mul_lo_u32 v6, s7, v5 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, 1, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v10 -; CHECK-NEXT: v_addc_u32_e32 v12, vcc, 0, v11, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_subb_u32_e64 v7, s[4:5], v1, v6, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6 -; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5] -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s7, v0 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc -; CHECK-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v1, v1, s5 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 20 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %num, 1235195 ret i64 %result } define <2 x i64> @v_udiv_v2i64_oddk_denom(<2 x i64> %num) { -; GISEL-LABEL: v_udiv_v2i64_oddk_denom: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s12 -; GISEL-NEXT: s_sub_u32 s8, 0, s12 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: v_mov_b32_e32 v6, v4 -; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v6 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s9, 0, 0 -; GISEL-NEXT: s_bfe_i32 s10, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 -; GISEL-NEXT: s_sub_u32 s13, 0, s12 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v6, v6 -; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_trunc_f32_e32 v7, v7 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_mul_lo_u32 v8, s13, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, s8, v7 -; GISEL-NEXT: v_mul_lo_u32 v10, s13, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, s6, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, s13, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v14, s9, v5 -; GISEL-NEXT: v_mul_hi_u32 v15, s8, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v17, v5, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v18, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v6, v8 -; GISEL-NEXT: v_mul_lo_u32 v19, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; 
GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v16 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v18 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v15 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, s13, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, s6, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, s13, v4 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 -; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v15, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v16, s9, v5 -; GISEL-NEXT: v_mul_hi_u32 v17, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v18, s8, v13 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v15 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v5, v15 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v16 -; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], v19, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], v17, v18 -; GISEL-NEXT: v_mul_lo_u32 v17, s13, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v10, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[8:9], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v17, v4, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[8:9], v12, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 -; GISEL-NEXT: v_add_i32_e64 v14, s[8:9], v18, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v14, s[8:9], v14, v17 -; GISEL-NEXT: v_mov_b32_e32 v14, s10 -; GISEL-NEXT: v_mov_b32_e32 v17, s11 -; GISEL-NEXT: s_bfe_i32 s13, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s14, -1, 0x10000 -; GISEL-NEXT: v_add_i32_e64 v6, s[10:11], v6, v8 -; GISEL-NEXT: v_mov_b32_e32 v8, s13 -; GISEL-NEXT: v_add_i32_e64 v7, s[10:11], v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[8:9], v18, v15 -; GISEL-NEXT: v_mul_lo_u32 v18, v10, v12 -; GISEL-NEXT: v_mul_hi_u32 v10, v10, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v18, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[8:9], v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[8:9], v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v13, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v16, v5, v16 -; 
GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v19, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v19, v16 -; GISEL-NEXT: v_mov_b32_e32 v19, s14 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v12, v15 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v18 -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v12 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v13, v15 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc -; GISEL-NEXT: v_addc_u32_e64 v7, vcc, v7, v12, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v5 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 -; GISEL-NEXT: v_mul_lo_u32 v13, v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 -; GISEL-NEXT: v_mul_lo_u32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v15, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, s12, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, s12, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_lo_u32 v12, s12, v5 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v18, s12, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_mul_lo_u32 v9, s12, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, s12, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v4 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v6, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v5 -; 
GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v3, v9, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v9 -; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s12, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v12 -; GISEL-NEXT: v_subb_u32_e64 v12, s[6:7], v1, v11, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v11 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], 1, v13 -; GISEL-NEXT: v_addc_u32_e64 v11, s[8:9], 0, v16, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[6:7] -; GISEL-NEXT: v_cmp_le_u32_e64 s[6:7], s12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v14, v9, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], 1, v15 -; GISEL-NEXT: v_addc_u32_e64 v14, s[6:7], 0, v18, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] -; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, s12, v2 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v13, v10, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, v15, v12, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v11, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v1, v18, v14, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GISEL-NEXT: s_setpc_b64 s[30:31] -; -; CGP-LABEL: v_udiv_v2i64_oddk_denom: -; CGP: ; %bb.0: -; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_mov_b32 s8, 0xffed2705 -; CGP-NEXT: s_mov_b32 s12, 0x12d8fb -; CGP-NEXT: s_bfe_i32 s10, -1, 0x10000 -; CGP-NEXT: s_bfe_i32 s11, -1, 0x10000 -; CGP-NEXT: s_bfe_i32 s13, -1, 0x10000 -; CGP-NEXT: s_bfe_i32 s14, -1, 0x10000 -; CGP-NEXT: v_mov_b32_e32 v6, v4 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v6 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v6, v6 -; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_mul_lo_u32 v8, s8, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s8, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s8, v4 -; CGP-NEXT: v_mul_lo_u32 
v11, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v12, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v13, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v14, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v15, s8, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v16, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v6, v10 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v8 -; CGP-NEXT: v_mul_hi_u32 v18, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v19, v5, v9 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v16 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v18 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v16 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v19, v18 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v15 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v11, s8, v4 -; CGP-NEXT: v_mul_lo_u32 v12, -1, v4 -; CGP-NEXT: v_mul_hi_u32 v14, s8, v4 -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 -; CGP-NEXT: v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v15, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v16, -1, v5 -; CGP-NEXT: v_mul_hi_u32 v17, s8, v5 -; CGP-NEXT: v_mul_lo_u32 v18, s8, v13 -; CGP-NEXT: v_mul_lo_u32 v19, v13, v15 -; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v16, v18 -; CGP-NEXT: v_mul_hi_u32 v18, v5, v15 -; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v16, v17 -; CGP-NEXT: v_mul_lo_u32 v17, v5, v16 -; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v19, v17 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v17, s[6:7], v17, v18 -; CGP-NEXT: v_mul_lo_u32 v17, s8, v10 -; CGP-NEXT: v_mul_lo_u32 v18, v10, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[8:9], v12, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v4, v11 -; CGP-NEXT: v_add_i32_e64 v12, s[8:9], v12, v14 -; CGP-NEXT: v_mul_lo_u32 v14, v4, v12 -; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v18, v14 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v14, s[8:9], v14, v17 -; CGP-NEXT: v_mov_b32_e32 v14, s10 -; CGP-NEXT: v_mov_b32_e32 v17, s11 -; CGP-NEXT: v_add_i32_e64 v6, s[10:11], v6, v8 -; CGP-NEXT: 
v_mov_b32_e32 v8, s13 -; CGP-NEXT: v_add_i32_e64 v7, s[10:11], v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v10, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v15, s[8:9], v18, v15 -; CGP-NEXT: v_mul_lo_u32 v18, v10, v12 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v12 -; CGP-NEXT: v_add_i32_e64 v9, s[8:9], v18, v9 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v9, s[8:9], v9, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; CGP-NEXT: v_add_i32_e64 v12, s[8:9], v18, v12 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 -; CGP-NEXT: v_mul_lo_u32 v19, v13, v16 -; CGP-NEXT: v_mul_hi_u32 v13, v13, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v5, v16 -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v19, v11 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v16, s[6:7], v19, v16 -; CGP-NEXT: v_mov_b32_e32 v19, s14 -; CGP-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v11, s[6:7], v11, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v12, v15 -; CGP-NEXT: v_add_i32_e64 v15, s[6:7], v16, v18 -; CGP-NEXT: v_add_i32_e64 v10, s[6:7], v10, v12 -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], v13, v15 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v10, vcc -; CGP-NEXT: v_addc_u32_e64 v7, vcc, v7, v12, s[4:5] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v1, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v0, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_mul_lo_u32 v13, v2, v6 -; CGP-NEXT: v_mul_lo_u32 v15, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v16, v2, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v3, v6 -; CGP-NEXT: v_mul_lo_u32 v18, v0, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v11, v1, v7 -; CGP-NEXT: v_mul_hi_u32 v12, v0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v15, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v16 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v18, v16 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_lo_u32 v10, s12, v4 -; CGP-NEXT: v_mul_lo_u32 v13, 0, v4 -; CGP-NEXT: 
v_mul_hi_u32 v15, s12, v4 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v12, s12, v5 -; CGP-NEXT: v_mul_lo_u32 v16, 0, v5 -; CGP-NEXT: v_mul_hi_u32 v18, s12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v9, s12, v6 -; CGP-NEXT: v_mul_lo_u32 v11, s12, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v4 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v6, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v5 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v7, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v3, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v9 -; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s12, v2 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v12 -; CGP-NEXT: v_subb_u32_e64 v12, s[6:7], v1, v11, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v11 -; CGP-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; CGP-NEXT: v_add_i32_e64 v10, s[8:9], 1, v13 -; CGP-NEXT: v_addc_u32_e64 v11, s[8:9], 0, v16, s[8:9] -; CGP-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[6:7] -; CGP-NEXT: v_cmp_le_u32_e64 s[6:7], s12, v0 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; CGP-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v12 -; CGP-NEXT: v_cndmask_b32_e64 v9, v14, v9, s[6:7] -; CGP-NEXT: v_add_i32_e64 v12, s[6:7], 1, v15 -; CGP-NEXT: v_addc_u32_e64 v14, s[6:7], 0, v18, s[6:7] -; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] -; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s12, v2 -; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CGP-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 -; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v10, vcc -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; CGP-NEXT: v_cndmask_b32_e64 v0, v15, v12, s[4:5] -; CGP-NEXT: v_cndmask_b32_e32 v3, v16, v11, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v1, v18, v14, s[4:5] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 -; CGP-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[4:5] -; CGP-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5] -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: s_setpc_b64 s[30:31] +; CHECK-LABEL: v_udiv_v2i64_oddk_denom: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0x1fb03c31 +; CHECK-NEXT: s_mov_b32 s5, 0xd9528440 +; CHECK-NEXT: v_mul_lo_u32 v4, v1, s4 +; CHECK-NEXT: v_mul_lo_u32 v5, v0, s5 +; CHECK-NEXT: v_mul_hi_u32 v6, v0, s4 +; CHECK-NEXT: v_mul_lo_u32 v7, v1, s5 +; CHECK-NEXT: v_mul_hi_u32 v8, v1, s4 +; CHECK-NEXT: v_mul_hi_u32 v0, v0, s5 +; CHECK-NEXT: v_mul_hi_u32 v1, v1, s5 +; CHECK-NEXT: v_mul_lo_u32 v9, v3, s4 +; CHECK-NEXT: v_mul_lo_u32 v10, v2, s5 +; CHECK-NEXT: v_mul_hi_u32 v11, v2, s4 +; CHECK-NEXT: v_mul_lo_u32 v12, v3, s5 +; CHECK-NEXT: v_mul_hi_u32 v13, v3, s4 +; CHECK-NEXT: v_mul_hi_u32 v2, 
v2, s5 +; CHECK-NEXT: v_mul_hi_u32 v3, v3, s5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v9 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 20 +; CHECK-NEXT: v_lshr_b64 v[2:3], v[2:3], 20 +; CHECK-NEXT: s_setpc_b64 s[30:31] %result = udiv <2 x i64> %num, <i64 1235195, i64 1235195> ret <2 x i64> %result }
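Both magic pairs exercised by the pow2k and oddk tests above can be checked in isolation. For 4096 the magic number is exact (2^64 / 4096 = 2^52, whose high word is the s4 = 0x100000 loaded in the new CHECK lines), so the pre-shift, post-shift, and NPQ fixup all vanish; for 1235195 the magic is 0xD95284401FB03C31 (the s5:s4 pair 0xd9528440/0x1fb03c31) with a post-shift of 20, visible as the trailing v_lshr_b64 by 20. The sketch below is not part of the patch: expandUDiv is an illustrative stand-in for the chain buildUDivUsingMul emits when magics.IsAdd == 0, and it assumes the Clang/GCC unsigned __int128 extension.

// Standalone sanity check for the new constants above; not part of the
// patch. expandUDiv mirrors the lshr/umulh/lshr shape built by
// buildUDivUsingMul when no NPQ fixup is required.
#include <cassert>
#include <cstdint>

static uint64_t expandUDiv(uint64_t X, uint64_t Magic, unsigned PreShift,
                           unsigned PostShift) {
  uint64_t Q = X >> PreShift;                           // G_LSHR (pre-shift)
  Q = (uint64_t)(((unsigned __int128)Q * Magic) >> 64); // G_UMULH by magic
  return Q >> PostShift;                                // G_LSHR (post-shift)
}

int main() {
  const uint64_t Samples[] = {0,    1,       4095,    4096,
                              1235194, 1235195, 0x123456789abcdef0ull, ~0ull};
  for (uint64_t X : Samples) {
    // pow2k tests: exact magic 2^52, both shifts zero.
    assert(expandUDiv(X, 1ull << 52, 0, 0) == X / 4096);
    // oddk tests: magic 0xd9528440'1fb03c31 (s5:s4), post-shift 20.
    assert(expandUDiv(X, 0xd95284401fb03c31ull, 0, 20) == X / 1235195);
  }
  return 0;
}

Since magics.IsAdd == 0 for both divisors, neither test takes the more expensive NPQ fixup path, which is why no add/sub-with-carry correction sequence appears in the new CHECK lines.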