diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4211,9 +4211,11 @@
   //===--------------------------------------------------------------------===//
   // Div utility functions
   //
-  SDValue BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
+  SDValue BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterOpLegalization,
+                    bool IsAfterTyLegalization,
                     SmallVectorImpl<SDNode *> &Created) const;
-  SDValue BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
+  SDValue BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterOpLegalization,
+                    bool IsAfterTyLegalization,
                     SmallVectorImpl<SDNode *> &Created) const;
 
   /// Targets may override this function to provide custom SDIV lowering for
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -21621,7 +21621,7 @@
     return SDValue();
 
   SmallVector<SDNode *, 8> Built;
-  if (SDValue S = TLI.BuildSDIV(N, DAG, LegalOperations, Built)) {
+  if (SDValue S = TLI.BuildSDIV(N, DAG, LegalOperations, LegalTypes, Built)) {
     for (SDNode *N : Built)
       AddToWorklist(N);
     return S;
@@ -21662,7 +21662,7 @@
     return SDValue();
 
   SmallVector<SDNode *, 8> Built;
-  if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, Built)) {
+  if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, LegalTypes, Built)) {
     for (SDNode *N : Built)
       AddToWorklist(N);
     return S;
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4946,20 +4946,16 @@
 /// multiplying by a magic number.
 /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
 SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
-                                  bool IsAfterLegalization,
+                                  bool IsAfterOpLegalization,
+                                  bool IsAfterTyLegalization,
                                   SmallVectorImpl<SDNode *> &Created) const {
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   EVT SVT = VT.getScalarType();
-  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout(), IsAfterTyLegalization);
   EVT ShSVT = ShVT.getScalarType();
   unsigned EltBits = VT.getScalarSizeInBits();
 
-  // Check to see if we can do this.
-  // FIXME: We should be more aggressive here.
-  if (!isTypeLegal(VT))
-    return SDValue();
-
   // If the sdiv has an 'exact' bit we can use a simpler lowering.
   if (N->getFlags().hasExact())
     return BuildExactSDIV(*this, N, dl, DAG, Created);
@@ -5017,18 +5013,34 @@
   }
 
   // Multiply the numerator (operand 0) by the magic value.
-  // FIXME: We should support doing a MUL in a wider type.
   SDValue Q;
-  if (IsAfterLegalization ? isOperationLegal(ISD::MULHS, VT)
-                          : isOperationLegalOrCustom(ISD::MULHS, VT))
+  if (IsAfterOpLegalization ? isOperationLegal(ISD::MULHS, VT)
+                            : isOperationLegalOrCustom(ISD::MULHS, VT))
     Q = DAG.getNode(ISD::MULHS, dl, VT, N0, MagicFactor);
-  else if (IsAfterLegalization ? isOperationLegal(ISD::SMUL_LOHI, VT)
-                               : isOperationLegalOrCustom(ISD::SMUL_LOHI, VT)) {
+  else if (IsAfterOpLegalization
+               ? isOperationLegal(ISD::SMUL_LOHI, VT)
+               : isOperationLegalOrCustom(ISD::SMUL_LOHI, VT)) {
     SDValue LoHi =
         DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), N0, MagicFactor);
     Q = SDValue(LoHi.getNode(), 1);
-  } else
-    return SDValue(); // No mulhs or equivalent.
+  } else {
+    if (IsAfterTyLegalization)
+      return SDValue();
+    // No MULHS or SMUL_LOHI. Multiply in a wider integer and take the upper
+    // part. Even if targets do not support the other two instructions well,
+    // they are pretty good at supporting plain `MUL` at any width.
+    EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), EltBits * 2);
+    if (VT.isVector())
+      WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT,
+                                VT.getVectorNumElements());
+    SDValue DoubleMul = DAG.getNode(
+        ISD::MUL, dl, WideVT, DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, N0),
+        DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, MagicFactor));
+    SDValue Upper = DAG.getNode(ISD::SRL, dl, WideVT, DoubleMul,
+                                DAG.getConstant(EltBits, dl, WideVT));
+    SDValue Hi = DAG.getNode(ISD::TRUNCATE, dl, VT, Upper);
+    Q = Hi;
+  }
   Created.push_back(Q.getNode());
 
   // (Optionally) Add/subtract the numerator using Factor.
@@ -5055,20 +5067,16 @@
 /// multiplying by a magic number.
 /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
 SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
-                                  bool IsAfterLegalization,
+                                  bool IsAfterOpLegalization,
+                                  bool IsAfterTyLegalization,
                                   SmallVectorImpl<SDNode *> &Created) const {
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   EVT SVT = VT.getScalarType();
-  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout(), IsAfterTyLegalization);
   EVT ShSVT = ShVT.getScalarType();
   unsigned EltBits = VT.getScalarSizeInBits();
 
-  // Check to see if we can do this.
-  // FIXME: We should be more aggressive here.
-  if (!isTypeLegal(VT))
-    return SDValue();
-
   bool UseNPQ = false;
   SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
@@ -5137,18 +5145,32 @@
   Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift);
   Created.push_back(Q.getNode());
 
-  // FIXME: We should support doing a MUL in a wider type.
   auto GetMULHU = [&](SDValue X, SDValue Y) {
-    if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT)
-                            : isOperationLegalOrCustom(ISD::MULHU, VT))
+    if (IsAfterOpLegalization ? isOperationLegal(ISD::MULHU, VT)
+                              : isOperationLegalOrCustom(ISD::MULHU, VT))
       return DAG.getNode(ISD::MULHU, dl, VT, X, Y);
-    if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT)
-                            : isOperationLegalOrCustom(ISD::UMUL_LOHI, VT)) {
+    if (IsAfterOpLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT)
+                              : isOperationLegalOrCustom(ISD::UMUL_LOHI, VT)) {
       SDValue LoHi =
          DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y);
       return SDValue(LoHi.getNode(), 1);
     }
-    return SDValue(); // No mulhu or equivalent
+    if (IsAfterTyLegalization)
+      return SDValue();
+    // No MULHU or UMUL_LOHI. Multiply in a wider integer and take the upper
+    // part. Even if targets do not support the other two instructions well,
+    // they are pretty good at supporting plain `MUL` at any width.
+    EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), EltBits * 2);
+    if (VT.isVector())
+      WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT,
+                                VT.getVectorNumElements());
+    SDValue DoubleMul = DAG.getNode(
+        ISD::MUL, dl, WideVT, DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, X),
+        DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Y));
+    SDValue Upper = DAG.getNode(ISD::SRL, dl, WideVT, DoubleMul,
+                                DAG.getConstant(EltBits, dl, WideVT));
+    SDValue Hi = DAG.getNode(ISD::TRUNCATE, dl, VT, Upper);
+    return Hi;
   };
 
   // Multiply the numerator (operand 0) by the magic value.
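Reviewer note (not part of the patch): both new fallback paths use the same idea. When neither MULHS/MULHU nor the corresponding *MUL_LOHI node is available for VT and type legalization has not run yet, the high half of the EltBits-wide product is obtained by extending both operands to 2*EltBits, doing a plain MUL, shifting right by EltBits, and truncating; the IsAfterTyLegalization bail-out is there because after type legalization the doubled-width type may itself be illegal. Below is a minimal scalar sketch of the arithmetic this produces, using the unsigned i16 divide by 95 exercised by fold_urem_vec_2 in the AArch64 tests that follow; the magic constant 44151 = ceil(2^22 / 95) and the total shift of 22 are taken from those CHECK lines, and the helper name is made up for illustration.

// Standalone illustration, C++17.
#include <cassert>
#include <cstdint>

// x / 95 for any 16-bit x, computed the way the widened-MUL fallback does it.
static uint16_t udiv_by_95(uint16_t X) {
  uint32_t Wide = uint32_t(X) * 44151u; // ZERO_EXTEND both operands, MUL in the 2*EltBits type
  uint32_t Hi = Wide >> 16;             // SRL by EltBits, then TRUNCATE: the "upper part"
  return uint16_t(Hi >> 6);             // PostShift applied by BuildUDIV (6 for divisor 95)
}

int main() {
  for (uint32_t X = 0; X <= 0xFFFF; ++X) // exhaustive check over i16
    assert(udiv_by_95(uint16_t(X)) == X / 95);
}

The signed path has the same structure, with SIGN_EXTEND in place of ZERO_EXTEND and the MULHS magic constants.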
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq.ll b/llvm/test/CodeGen/AArch64/srem-seteq.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq.ll @@ -83,13 +83,10 @@ define i16 @test_srem_even(i16 %X) nounwind { ; CHECK-LABEL: test_srem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #9363 ; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: movk w9, #37449, lsl #16 -; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: asr w9, w8, #3 +; CHECK-NEXT: mov w9, #18725 +; CHECK-NEXT: mul w8, w8, w9 +; CHECK-NEXT: asr w9, w8, #18 ; CHECK-NEXT: add w8, w9, w8, lsr #31 ; CHECK-NEXT: mov w9, #14 ; CHECK-NEXT: msub w8, w8, w9, w0 diff --git a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll --- a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll @@ -4,50 +4,21 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { ; CHECK-LABEL: fold_srem_vec_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #63421 -; CHECK-NEXT: mov w12, #33437 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: movk w9, #31710, lsl #16 -; CHECK-NEXT: smov w11, v0.h[2] -; CHECK-NEXT: movk w12, #21399, lsl #16 -; CHECK-NEXT: smull x12, w11, w12 -; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: lsr x13, x12, #63 -; CHECK-NEXT: asr x12, x12, #37 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w12, w12, w13 -; CHECK-NEXT: mov w13, #98 -; CHECK-NEXT: sub w9, w9, w8 -; CHECK-NEXT: msub w11, w12, w13, w11 -; CHECK-NEXT: asr w13, w9, #6 -; CHECK-NEXT: add w9, w13, w9, lsr #31 -; CHECK-NEXT: mov w13, #37253 -; CHECK-NEXT: mov w10, #-124 -; CHECK-NEXT: smov w12, v0.h[0] -; CHECK-NEXT: movk w13, #44150, lsl #16 -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: smull x10, w12, w13 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: add w10, w10, w12 -; CHECK-NEXT: asr w13, w10, #6 -; CHECK-NEXT: mov w9, #95 -; CHECK-NEXT: add w10, w13, w10, lsr #31 -; CHECK-NEXT: msub w9, w10, w9, w12 -; CHECK-NEXT: mov w10, #63249 -; CHECK-NEXT: smov w13, v0.h[3] -; CHECK-NEXT: movk w10, #48808, lsl #16 -; CHECK-NEXT: smull x10, w13, w10 -; CHECK-NEXT: lsr x12, x10, #63 -; CHECK-NEXT: asr x10, x10, #40 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: add w10, w10, w12 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #-1003 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: msub w8, w10, w8, w13 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_1] +; CHECK-NEXT: adrp x8, .LCPI0_2 +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: adrp x8, .LCPI0_3 +; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h +; CHECK-NEXT: shrn v2.4h, v2.4s, #16 +; CHECK-NEXT: mla v2.4h, v0.4h, v1.4h +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_3] +; CHECK-NEXT: neg v3.4h, v3.4h +; CHECK-NEXT: sshl v2.4h, v2.4h, v3.4h +; CHECK-NEXT: usra v2.4h, v2.4h, #15 +; CHECK-NEXT: mls v0.4h, v2.4h, v1.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -56,43 +27,15 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { ; CHECK-LABEL: fold_srem_vec_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #37253 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: movk w9, #44150, lsl #16 -; CHECK-NEXT: smov w10, v0.h[0] -; CHECK-NEXT: smull x13, w8, w9 -; CHECK-NEXT: smov w11, 
v0.h[2] -; CHECK-NEXT: smull x14, w10, w9 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: smull x15, w11, w9 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: add w13, w13, w8 -; CHECK-NEXT: smull x9, w12, w9 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: add w14, w14, w10 -; CHECK-NEXT: asr w16, w13, #6 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w15, w15, w11 -; CHECK-NEXT: add w13, w16, w13, lsr #31 -; CHECK-NEXT: asr w16, w14, #6 -; CHECK-NEXT: add w9, w9, w12 -; CHECK-NEXT: add w14, w16, w14, lsr #31 -; CHECK-NEXT: asr w16, w15, #6 -; CHECK-NEXT: add w15, w16, w15, lsr #31 -; CHECK-NEXT: asr w16, w9, #6 -; CHECK-NEXT: add w9, w16, w9, lsr #31 -; CHECK-NEXT: mov w16, #95 -; CHECK-NEXT: msub w10, w14, w16, w10 -; CHECK-NEXT: msub w8, w13, w16, w8 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: msub w11, w15, w16, w11 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: msub w8, w9, w16, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov w8, #44151 +; CHECK-NEXT: dup v1.4h, w8 +; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: add v1.4h, v1.4h, v0.4h +; CHECK-NEXT: sshr v1.4h, v1.4h, #6 +; CHECK-NEXT: usra v1.4h, v1.4h, #15 +; CHECK-NEXT: movi v2.4h, #95 +; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -103,47 +46,16 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; CHECK-LABEL: combine_srem_sdiv: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #37253 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: movk w8, #44150, lsl #16 -; CHECK-NEXT: smov w9, v0.h[1] -; CHECK-NEXT: smov w10, v0.h[0] -; CHECK-NEXT: smull x13, w9, w8 -; CHECK-NEXT: smov w11, v0.h[2] -; CHECK-NEXT: smull x14, w10, w8 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: smull x15, w11, w8 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: add w13, w13, w9 -; CHECK-NEXT: smull x8, w12, w8 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: add w14, w14, w10 -; CHECK-NEXT: asr w16, w13, #6 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: add w15, w15, w11 -; CHECK-NEXT: add w13, w16, w13, lsr #31 -; CHECK-NEXT: asr w16, w14, #6 -; CHECK-NEXT: add w8, w8, w12 -; CHECK-NEXT: add w14, w16, w14, lsr #31 -; CHECK-NEXT: asr w16, w15, #6 -; CHECK-NEXT: add w15, w16, w15, lsr #31 -; CHECK-NEXT: asr w16, w8, #6 -; CHECK-NEXT: add w8, w16, w8, lsr #31 -; CHECK-NEXT: mov w16, #95 -; CHECK-NEXT: msub w10, w14, w16, w10 -; CHECK-NEXT: msub w9, w13, w16, w9 -; CHECK-NEXT: fmov s0, w14 -; CHECK-NEXT: fmov s1, w10 -; CHECK-NEXT: msub w11, w15, w16, w11 -; CHECK-NEXT: mov v0.h[1], w13 -; CHECK-NEXT: mov v1.h[1], w9 -; CHECK-NEXT: msub w12, w8, w16, w12 -; CHECK-NEXT: mov v0.h[2], w15 -; CHECK-NEXT: mov v1.h[2], w11 -; CHECK-NEXT: mov v1.h[3], w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-NEXT: mov w8, #44151 +; CHECK-NEXT: dup v2.4h, w8 +; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h +; CHECK-NEXT: shrn v2.4h, v2.4s, #16 +; CHECK-NEXT: add v2.4h, v2.4h, v0.4h +; CHECK-NEXT: sshr v2.4h, v2.4h, #6 +; CHECK-NEXT: movi v1.4h, #95 +; CHECK-NEXT: usra v2.4h, v2.4h, #15 +; CHECK-NEXT: mls v0.4h, v2.4h, v1.4h +; CHECK-NEXT: add v0.4h, v0.4h, v2.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, %2 = sdiv <4 x i16> %x, @@ -155,40 +67,19 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_power_of_two: ; CHECK: // %bb.0: -; CHECK-NEXT: // 
kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: add w12, w8, #31 // =31 -; CHECK-NEXT: cmp w8, #0 // =0 -; CHECK-NEXT: mov w11, #37253 -; CHECK-NEXT: csel w12, w12, w8, lt -; CHECK-NEXT: smov w9, v0.h[0] -; CHECK-NEXT: smov w10, v0.h[3] -; CHECK-NEXT: movk w11, #44150, lsl #16 -; CHECK-NEXT: and w12, w12, #0xffffffe0 -; CHECK-NEXT: sub w8, w8, w12 -; CHECK-NEXT: add w12, w9, #63 // =63 -; CHECK-NEXT: smull x11, w10, w11 -; CHECK-NEXT: cmp w9, #0 // =0 -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: csel w12, w12, w9, lt -; CHECK-NEXT: add w11, w11, w10 -; CHECK-NEXT: and w12, w12, #0xffffffc0 -; CHECK-NEXT: sub w9, w9, w12 -; CHECK-NEXT: asr w12, w11, #6 -; CHECK-NEXT: add w11, w12, w11, lsr #31 -; CHECK-NEXT: smov w12, v0.h[2] -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: add w9, w12, #7 // =7 -; CHECK-NEXT: cmp w12, #0 // =0 -; CHECK-NEXT: csel w9, w9, w12, lt -; CHECK-NEXT: and w9, w9, #0xfffffff8 -; CHECK-NEXT: sub w9, w12, w9 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #95 -; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: msub w8, w11, w8, w10 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: adrp x9, .LCPI3_1 +; CHECK-NEXT: ldr d2, [x9, :lo12:.LCPI3_1] +; CHECK-NEXT: adrp x8, .LCPI3_2 +; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: add v1.4h, v1.4h, v0.4h +; CHECK-NEXT: neg v2.4h, v2.4h +; CHECK-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-NEXT: usra v1.4h, v1.4h, #15 +; CHECK-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -198,41 +89,24 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #17097 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[2] -; CHECK-NEXT: movk w9, #45590, lsl #16 -; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w9, w9, w8 -; CHECK-NEXT: asr w12, w9, #4 -; CHECK-NEXT: add w9, w12, w9, lsr #31 -; CHECK-NEXT: mov w12, #30865 -; CHECK-NEXT: mov w10, #23 -; CHECK-NEXT: smov w11, v0.h[1] -; CHECK-NEXT: movk w12, #51306, lsl #16 -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: smull x10, w11, w12 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: add w10, w10, w11 -; CHECK-NEXT: asr w12, w10, #9 -; CHECK-NEXT: mov w9, #654 -; CHECK-NEXT: add w10, w12, w10, lsr #31 -; CHECK-NEXT: msub w9, w10, w9, w11 -; CHECK-NEXT: mov w10, #47143 -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: movk w10, #24749, lsl #16 -; CHECK-NEXT: smull x10, w12, w10 -; CHECK-NEXT: lsr x11, x10, #63 -; CHECK-NEXT: asr x10, x10, #43 -; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: add w10, w10, w11 -; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: mov w9, #5423 -; CHECK-NEXT: mov v0.h[2], w8 -; CHECK-NEXT: msub w8, w10, w9, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: adrp x8, .LCPI4_1 +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_1] +; CHECK-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h +; CHECK-NEXT: and v1.8b, v0.8b, v1.8b +; CHECK-NEXT: shrn v2.4h, v2.4s, #16 +; CHECK-NEXT: adrp x8, .LCPI4_2 +; CHECK-NEXT: neg v3.4h, v3.4h +; CHECK-NEXT: add v1.4h, v2.4h, v1.4h +; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI4_2] +; 
CHECK-NEXT: sshl v1.4h, v1.4h, v3.4h +; CHECK-NEXT: ushr v2.4h, v1.4h, #15 +; CHECK-NEXT: movi d3, #0xffffffffffff0000 +; CHECK-NEXT: and v2.8b, v2.8b, v3.8b +; CHECK-NEXT: add v1.4h, v1.4h, v2.4h +; CHECK-NEXT: mls v0.4h, v1.4h, v4.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -242,38 +116,24 @@ define <4 x i16> @dont_fold_srem_i16_smax(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_i16_smax: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w10, #17097 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w9, v0.h[2] -; CHECK-NEXT: movk w10, #45590, lsl #16 -; CHECK-NEXT: smull x10, w9, w10 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: add w10, w10, w9 -; CHECK-NEXT: asr w12, w10, #4 -; CHECK-NEXT: mov w11, #23 -; CHECK-NEXT: add w10, w12, w10, lsr #31 -; CHECK-NEXT: msub w9, w10, w11, w9 -; CHECK-NEXT: mov w10, #47143 -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: movk w10, #24749, lsl #16 -; CHECK-NEXT: smull x10, w12, w10 -; CHECK-NEXT: lsr x11, x10, #63 -; CHECK-NEXT: asr x10, x10, #43 -; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: add w10, w10, w11 -; CHECK-NEXT: mov w11, #32767 -; CHECK-NEXT: add w11, w8, w11 -; CHECK-NEXT: cmp w8, #0 // =0 -; CHECK-NEXT: csel w11, w11, w8, lt -; CHECK-NEXT: and w11, w11, #0xffff8000 -; CHECK-NEXT: sub w8, w8, w11 -; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #5423 -; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: msub w8, w10, w8, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: adrp x8, .LCPI5_1 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI5_1] +; CHECK-NEXT: adrp x8, .LCPI5_2 +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI5_2] +; CHECK-NEXT: adrp x8, .LCPI5_3 +; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h +; CHECK-NEXT: shrn v2.4h, v2.4s, #16 +; CHECK-NEXT: neg v3.4h, v3.4h +; CHECK-NEXT: mla v2.4h, v0.4h, v1.4h +; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI5_3] +; CHECK-NEXT: sshl v1.4h, v2.4h, v3.4h +; CHECK-NEXT: ushr v2.4h, v1.4h, #15 +; CHECK-NEXT: movi d3, #0xffffffffffff0000 +; CHECK-NEXT: and v2.8b, v2.8b, v3.8b +; CHECK-NEXT: add v1.4h, v1.4h, v2.4h +; CHECK-NEXT: mls v0.4h, v1.4h, v4.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -283,41 +143,64 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; CHECK-LABEL: dont_fold_srem_i64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x9, #21445 +; CHECK-NEXT: movk x9, #1603, lsl #16 +; CHECK-NEXT: movk x9, #15432, lsl #32 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: movk x9, #25653, lsl #48 +; CHECK-NEXT: smulh x8, x8, x9 ; CHECK-NEXT: mov x9, #6055 ; CHECK-NEXT: movk x9, #58853, lsl #16 ; CHECK-NEXT: movk x9, #47142, lsl #32 -; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: mov x11, v1.d[1] ; CHECK-NEXT: movk x9, #24749, lsl #48 -; CHECK-NEXT: smulh x9, x8, x9 -; CHECK-NEXT: asr x12, x9, #11 +; CHECK-NEXT: smulh x9, x11, x9 +; CHECK-NEXT: mov x11, #8549 +; CHECK-NEXT: movk x11, #22795, lsl #16 +; CHECK-NEXT: adrp x10, .LCPI6_0 +; CHECK-NEXT: movk x11, #17096, lsl #32 +; CHECK-NEXT: ldr q2, [x10, :lo12:.LCPI6_0] +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: movk x11, #45590, lsl #48 +; CHECK-NEXT: smulh x10, x10, x11 +; CHECK-NEXT: adrp x11, .LCPI6_3 +; CHECK-NEXT: ldr q3, [x11, :lo12:.LCPI6_3] +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: fmov d5, x8 +; CHECK-NEXT: mov v4.d[1], v5.d[0] +; CHECK-NEXT: fmov d5, x9 +; CHECK-NEXT: fmov d6, x10 +; CHECK-NEXT: mov v6.d[1], v5.d[0] +; CHECK-NEXT: and 
v5.16b, v1.16b, v2.16b +; CHECK-NEXT: add v5.2d, v6.2d, v5.2d +; CHECK-NEXT: neg v3.2d, v3.2d +; CHECK-NEXT: adrp x8, .LCPI6_1 +; CHECK-NEXT: sshl v3.2d, v5.2d, v3.2d +; CHECK-NEXT: usra v3.2d, v5.2d, #63 +; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI6_1] +; CHECK-NEXT: adrp x9, .LCPI6_2 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI6_2] +; CHECK-NEXT: and v2.16b, v0.16b, v2.16b +; CHECK-NEXT: neg v5.2d, v5.2d +; CHECK-NEXT: add v2.2d, v4.2d, v2.2d +; CHECK-NEXT: sshl v4.2d, v2.2d, v5.2d +; CHECK-NEXT: ushr v2.2d, v2.2d, #63 +; CHECK-NEXT: mov w9, #23 +; CHECK-NEXT: and v2.16b, v2.16b, v3.16b +; CHECK-NEXT: mul x9, x10, x9 ; CHECK-NEXT: mov w10, #5423 -; CHECK-NEXT: add x9, x12, x9, lsr #63 -; CHECK-NEXT: msub x8, x9, x10, x8 -; CHECK-NEXT: mov x9, #21445 -; CHECK-NEXT: movk x9, #1603, lsl #16 -; CHECK-NEXT: movk x9, #15432, lsl #32 -; CHECK-NEXT: mov x12, v0.d[1] -; CHECK-NEXT: movk x9, #25653, lsl #48 -; CHECK-NEXT: smulh x9, x12, x9 -; CHECK-NEXT: asr x10, x9, #8 -; CHECK-NEXT: add x9, x10, x9, lsr #63 +; CHECK-NEXT: add v2.2d, v4.2d, v2.2d +; CHECK-NEXT: mul x8, x8, x10 ; CHECK-NEXT: mov w10, #654 -; CHECK-NEXT: msub x9, x9, x10, x12 -; CHECK-NEXT: mov x10, #8549 -; CHECK-NEXT: movk x10, #22795, lsl #16 -; CHECK-NEXT: movk x10, #17096, lsl #32 -; CHECK-NEXT: fmov x11, d1 -; CHECK-NEXT: movk x10, #45590, lsl #48 -; CHECK-NEXT: smulh x10, x11, x10 -; CHECK-NEXT: add x10, x10, x11 -; CHECK-NEXT: asr x12, x10, #4 -; CHECK-NEXT: add x10, x12, x10, lsr #63 -; CHECK-NEXT: mov w12, #23 -; CHECK-NEXT: msub x10, x10, x12, x11 -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: fmov d1, x10 -; CHECK-NEXT: mov v1.d[1], x8 -; CHECK-NEXT: mov v0.d[1], x9 +; CHECK-NEXT: fmov d3, x9 +; CHECK-NEXT: mov x9, v2.d[1] +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov v3.d[1], x8 +; CHECK-NEXT: mov v2.d[1], x9 +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sub v1.2d, v1.2d, v3.2d ; CHECK-NEXT: ret %1 = srem <4 x i64> %x, ret <4 x i64> %1 diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll @@ -195,14 +195,15 @@ define i1 @t16_3_2(i16 %X) nounwind { ; CHECK-LABEL: t16_3_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #43691 ; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: movk w9, #43690, lsl #16 -; CHECK-NEXT: mov w10, #-1431655766 -; CHECK-NEXT: madd w8, w8, w9, w10 -; CHECK-NEXT: mov w9, #1431655765 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w9, #43691 +; CHECK-NEXT: mul w8, w8, w9 +; CHECK-NEXT: lsr w8, w8, #17 +; CHECK-NEXT: add w8, w8, w8, lsl #1 +; CHECK-NEXT: sub w8, w0, w8 +; CHECK-NEXT: and w8, w8, #0xffff +; CHECK-NEXT: cmp w8, #2 // =2 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i16 %X, 3 %cmp = icmp eq i16 %urem, 2 @@ -212,14 +213,15 @@ define i1 @t8_3_2(i8 %X) nounwind { ; CHECK-LABEL: t8_3_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #43691 ; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: movk w9, #43690, lsl #16 -; CHECK-NEXT: mov w10, #-1431655766 -; CHECK-NEXT: madd w8, w8, w9, w10 -; CHECK-NEXT: mov w9, #1431655765 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w9, #171 +; CHECK-NEXT: mul w8, w8, w9 +; CHECK-NEXT: lsr w8, w8, #9 +; CHECK-NEXT: add w8, w8, w8, lsl #1 +; CHECK-NEXT: sub w8, w0, w8 +; CHECK-NEXT: and w8, w8, #0xff +; CHECK-NEXT: cmp w8, #2 // =2 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret 
%urem = urem i8 %X, 3 %cmp = icmp eq i8 %urem, 2 diff --git a/llvm/test/CodeGen/AArch64/urem-seteq.ll b/llvm/test/CodeGen/AArch64/urem-seteq.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq.ll @@ -78,15 +78,14 @@ define i16 @test_urem_even(i16 %X) nounwind { ; CHECK-LABEL: test_urem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #28087 -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: movk w9, #46811, lsl #16 +; CHECK-NEXT: ubfx w8, w0, #1, #15 +; CHECK-NEXT: mov w9, #18725 ; CHECK-NEXT: mul w8, w8, w9 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: lsr w8, w8, #17 +; CHECK-NEXT: mov w9, #14 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: tst w8, #0xffff +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %urem = urem i16 %X, 14 %cmp = icmp ne i16 %urem, 0 diff --git a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll --- a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll @@ -4,44 +4,27 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; CHECK-LABEL: fold_urem_vec_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w11, #33437 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: movk w11, #21399, lsl #16 -; CHECK-NEXT: umull x11, w10, w11 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: mov w9, #16913 -; CHECK-NEXT: mov w12, #98 -; CHECK-NEXT: lsr x11, x11, #37 -; CHECK-NEXT: movk w9, #8456, lsl #16 -; CHECK-NEXT: msub w10, w11, w12, w10 -; CHECK-NEXT: ubfx w12, w8, #2, #14 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: mov w11, #124 -; CHECK-NEXT: lsr x9, x9, #34 -; CHECK-NEXT: msub w8, w9, w11, w8 -; CHECK-NEXT: mov w9, #8969 -; CHECK-NEXT: umov w12, v0.h[0] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: sub w11, w12, w9 -; CHECK-NEXT: add w9, w9, w11, lsr #1 -; CHECK-NEXT: mov w11, #95 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: msub w9, w9, w11, w12 -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov w9, #2287 -; CHECK-NEXT: movk w9, #16727, lsl #16 -; CHECK-NEXT: umull x9, w11, w9 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #1003 -; CHECK-NEXT: lsr x9, x9, #40 -; CHECK-NEXT: mov v0.h[2], w10 -; CHECK-NEXT: msub w8, w9, w8, w11 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_1] +; CHECK-NEXT: adrp x8, .LCPI0_2 +; CHECK-NEXT: neg v1.4h, v1.4h +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: ushl v1.4h, v0.4h, v1.4h +; CHECK-NEXT: adrp x8, .LCPI0_3 +; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h +; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI0_3] +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: adrp x8, .LCPI0_4 +; CHECK-NEXT: sub v2.4h, v0.4h, v1.4h +; CHECK-NEXT: umull v2.4s, v2.4h, v3.4h +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI0_4] +; CHECK-NEXT: shrn v2.4h, v2.4s, #16 +; CHECK-NEXT: add v1.4h, v2.4h, v1.4h +; CHECK-NEXT: neg v2.4h, v4.4h +; CHECK-NEXT: ushl v1.4h, v1.4h, v2.4h +; CHECK-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -50,43 +33,13 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { ; CHECK-LABEL: fold_urem_vec_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, 
#8969 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umov w10, v0.h[0] -; CHECK-NEXT: umull x13, w8, w9 -; CHECK-NEXT: umov w11, v0.h[2] -; CHECK-NEXT: umull x14, w10, w9 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: umov w12, v0.h[3] -; CHECK-NEXT: umull x15, w11, w9 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: sub w16, w8, w13 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: add w13, w13, w16, lsr #1 -; CHECK-NEXT: sub w16, w10, w14 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w14, w14, w16, lsr #1 -; CHECK-NEXT: sub w16, w11, w15 -; CHECK-NEXT: add w15, w15, w16, lsr #1 -; CHECK-NEXT: sub w16, w12, w9 -; CHECK-NEXT: add w9, w9, w16, lsr #1 -; CHECK-NEXT: mov w16, #95 -; CHECK-NEXT: lsr w13, w13, #6 -; CHECK-NEXT: msub w8, w13, w16, w8 -; CHECK-NEXT: lsr w13, w14, #6 -; CHECK-NEXT: msub w10, w13, w16, w10 -; CHECK-NEXT: lsr w13, w15, #6 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: msub w11, w13, w16, w11 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: msub w8, w9, w16, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov w8, #44151 +; CHECK-NEXT: dup v1.4h, w8 +; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: ushr v1.4s, v1.4s, #22 +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: movi v2.4h, #95 +; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -97,47 +50,14 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; CHECK-LABEL: combine_urem_udiv: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #8969 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: movk w8, #22765, lsl #16 -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w10, v0.h[0] -; CHECK-NEXT: umull x13, w9, w8 -; CHECK-NEXT: umov w11, v0.h[2] -; CHECK-NEXT: umull x14, w10, w8 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: umov w12, v0.h[3] -; CHECK-NEXT: umull x15, w11, w8 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: sub w16, w9, w13 -; CHECK-NEXT: umull x8, w12, w8 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: add w13, w13, w16, lsr #1 -; CHECK-NEXT: sub w16, w10, w14 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: add w14, w14, w16, lsr #1 -; CHECK-NEXT: sub w16, w11, w15 -; CHECK-NEXT: add w15, w15, w16, lsr #1 -; CHECK-NEXT: sub w16, w12, w8 -; CHECK-NEXT: add w8, w8, w16, lsr #1 -; CHECK-NEXT: mov w16, #95 -; CHECK-NEXT: lsr w14, w14, #6 -; CHECK-NEXT: lsr w13, w13, #6 -; CHECK-NEXT: msub w10, w14, w16, w10 -; CHECK-NEXT: lsr w15, w15, #6 -; CHECK-NEXT: msub w9, w13, w16, w9 -; CHECK-NEXT: fmov s0, w14 -; CHECK-NEXT: fmov s1, w10 -; CHECK-NEXT: lsr w8, w8, #6 -; CHECK-NEXT: msub w11, w15, w16, w11 -; CHECK-NEXT: mov v0.h[1], w13 -; CHECK-NEXT: mov v1.h[1], w9 -; CHECK-NEXT: msub w12, w8, w16, w12 -; CHECK-NEXT: mov v0.h[2], w15 -; CHECK-NEXT: mov v1.h[2], w11 -; CHECK-NEXT: mov v1.h[3], w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-NEXT: mov w8, #44151 +; CHECK-NEXT: dup v2.4h, w8 +; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h +; CHECK-NEXT: ushr v2.4s, v2.4s, #22 +; CHECK-NEXT: movi v1.4h, #95 +; CHECK-NEXT: xtn v2.4h, v2.4s +; CHECK-NEXT: mls v0.4h, v2.4h, v1.4h +; CHECK-NEXT: add v0.4h, v0.4h, v2.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, %2 = udiv <4 x i16> %x, @@ -150,28 +70,17 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; CHECK-LABEL: 
dont_fold_urem_power_of_two: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #8969 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: sub w10, w8, w9 -; CHECK-NEXT: add w9, w9, w10, lsr #1 -; CHECK-NEXT: mov w10, #95 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: and w9, w9, #0x3f -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: and w10, w10, #0x1f -; CHECK-NEXT: and w9, w9, #0x7 -; CHECK-NEXT: mov v1.h[1], w10 -; CHECK-NEXT: mov v1.h[2], w9 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: adrp x9, .LCPI3_1 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: ldr d2, [x9, :lo12:.LCPI3_1] +; CHECK-NEXT: adrp x8, .LCPI3_2 +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: neg v2.4h, v2.4h +; CHECK-NEXT: ushl v1.4h, v1.4h, v2.4h +; CHECK-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -181,34 +90,28 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #17097 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: movk w9, #45590, lsl #16 -; CHECK-NEXT: umull x9, w8, w9 -; CHECK-NEXT: mov w10, #23 -; CHECK-NEXT: lsr x9, x9, #36 -; CHECK-NEXT: umov w11, v0.h[1] -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: mov w9, #30865 -; CHECK-NEXT: movk w9, #51306, lsl #16 -; CHECK-NEXT: ubfx w10, w11, #1, #15 -; CHECK-NEXT: umull x9, w10, w9 -; CHECK-NEXT: mov w10, #654 -; CHECK-NEXT: lsr x9, x9, #40 -; CHECK-NEXT: msub w9, w9, w10, w11 -; CHECK-NEXT: mov w11, #47143 -; CHECK-NEXT: umov w10, v0.h[3] -; CHECK-NEXT: movk w11, #24749, lsl #16 -; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: umull x11, w10, w11 -; CHECK-NEXT: mov v1.h[1], w9 -; CHECK-NEXT: mov w9, #5423 -; CHECK-NEXT: lsr x11, x11, #43 -; CHECK-NEXT: mov v1.h[2], w8 -; CHECK-NEXT: msub w8, w11, w9, w10 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: adrp x8, .LCPI4_1 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_1] +; CHECK-NEXT: adrp x8, .LCPI4_2 +; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h +; CHECK-NEXT: umull v2.4s, v3.4h, v2.4h +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_2] +; CHECK-NEXT: shrn v2.4h, v2.4s, #16 +; CHECK-NEXT: add v1.4h, v2.4h, v1.4h +; CHECK-NEXT: adrp x8, .LCPI4_3 +; CHECK-NEXT: neg v3.4h, v3.4h +; CHECK-NEXT: movi d2, #0xffffffffffff0000 +; CHECK-NEXT: ushl v1.4h, v1.4h, v3.4h +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_3] +; CHECK-NEXT: and v1.8b, v1.8b, v2.8b +; CHECK-NEXT: movi d2, #0x0000000000ffff +; CHECK-NEXT: and v2.8b, v0.8b, v2.8b +; CHECK-NEXT: orr v1.8b, v2.8b, v1.8b +; CHECK-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -227,40 +130,62 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { ; CHECK-LABEL: dont_fold_urem_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x10, #12109 -; CHECK-NEXT: movk x10, #52170, lsl #16 -; CHECK-NEXT: movk x10, #28749, lsl #32 +; CHECK-NEXT: mov x9, #12109 +; CHECK-NEXT: movk 
x9, #52170, lsl #16 +; CHECK-NEXT: movk x9, #28749, lsl #32 ; CHECK-NEXT: mov x8, v1.d[1] -; CHECK-NEXT: movk x10, #49499, lsl #48 -; CHECK-NEXT: umulh x10, x8, x10 -; CHECK-NEXT: mov w11, #5423 -; CHECK-NEXT: lsr x10, x10, #12 -; CHECK-NEXT: msub x8, x10, x11, x8 +; CHECK-NEXT: movk x9, #49499, lsl #48 +; CHECK-NEXT: umulh x8, x8, x9 +; CHECK-NEXT: mov x9, #17097 +; CHECK-NEXT: movk x9, #45590, lsl #16 +; CHECK-NEXT: movk x9, #34192, lsl #32 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: movk x9, #25644, lsl #48 +; CHECK-NEXT: ushr v3.2d, v0.2d, #1 +; CHECK-NEXT: umulh x9, x10, x9 ; CHECK-NEXT: mov x10, #21445 +; CHECK-NEXT: fmov d4, x8 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: fmov d3, x9 ; CHECK-NEXT: movk x10, #1603, lsl #16 -; CHECK-NEXT: mov x12, v0.d[1] +; CHECK-NEXT: mov v3.d[1], v4.d[0] ; CHECK-NEXT: movk x10, #15432, lsl #32 +; CHECK-NEXT: sub v4.2d, v1.2d, v3.2d ; CHECK-NEXT: movk x10, #25653, lsl #48 -; CHECK-NEXT: lsr x11, x12, #1 -; CHECK-NEXT: umulh x10, x11, x10 -; CHECK-NEXT: mov w11, #654 -; CHECK-NEXT: lsr x10, x10, #7 -; CHECK-NEXT: msub x10, x10, x11, x12 -; CHECK-NEXT: mov x11, #17097 -; CHECK-NEXT: movk x11, #45590, lsl #16 -; CHECK-NEXT: movk x11, #34192, lsl #32 -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: movk x11, #25644, lsl #48 -; CHECK-NEXT: umulh x11, x9, x11 -; CHECK-NEXT: sub x12, x9, x11 -; CHECK-NEXT: add x11, x11, x12, lsr #1 -; CHECK-NEXT: mov w12, #23 -; CHECK-NEXT: lsr x11, x11, #4 -; CHECK-NEXT: msub x9, x11, x12, x9 -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: fmov d1, x9 -; CHECK-NEXT: mov v1.d[1], x8 -; CHECK-NEXT: mov v0.d[1], x10 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: umulh x8, x8, x10 +; CHECK-NEXT: lsr x9, x9, #1 +; CHECK-NEXT: fmov d2, xzr +; CHECK-NEXT: adrp x10, .LCPI6_0 +; CHECK-NEXT: fmov d4, x8 +; CHECK-NEXT: fmov d5, x9 +; CHECK-NEXT: mov v5.d[1], v2.d[0] +; CHECK-NEXT: mov v2.d[1], v4.d[0] +; CHECK-NEXT: ldr q4, [x10, :lo12:.LCPI6_0] +; CHECK-NEXT: add v3.2d, v5.2d, v3.2d +; CHECK-NEXT: adrp x8, .LCPI6_1 +; CHECK-NEXT: adrp x9, .LCPI6_2 +; CHECK-NEXT: neg v4.2d, v4.2d +; CHECK-NEXT: ushl v3.2d, v3.2d, v4.2d +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI6_1] +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI6_2] +; CHECK-NEXT: neg v4.2d, v4.2d +; CHECK-NEXT: mov w9, #23 +; CHECK-NEXT: ushl v2.2d, v2.2d, v4.2d +; CHECK-NEXT: mul x9, x10, x9 +; CHECK-NEXT: mov w10, #5423 +; CHECK-NEXT: bit v2.16b, v0.16b, v3.16b +; CHECK-NEXT: mul x8, x8, x10 +; CHECK-NEXT: mov w10, #654 +; CHECK-NEXT: fmov d3, x9 +; CHECK-NEXT: mov x9, v2.d[1] +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov v3.d[1], x8 +; CHECK-NEXT: mov v2.d[1], x9 +; CHECK-NEXT: sub v1.2d, v1.2d, v3.2d +; CHECK-NEXT: sub v0.2d, v0.2d, v2.2d ; CHECK-NEXT: ret %1 = urem <4 x i64> %x, ret <4 x i64> %1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -1,5 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s @@ 
-4926,127 +4925,34 @@ ; ; GCN-LABEL: udiv_i64_oddk_denom: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f176a73 -; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GCN-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s2, 0xfee0 -; GCN-NEXT: s_mov_b32 s3, 0x68958c89 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 -; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, 0x38f83e5 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v3, v0, s3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: s_movk_i32 s4, 0x11e -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: s_mov_b32 s10, -1 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 -; GCN-NEXT: v_mul_hi_u32 v5, v0, s3 -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v6, v2, s3 -; GCN-NEXT: s_mov_b32 s2, 0x976a7377 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 -; GCN-NEXT: s_movk_i32 s3, 0x11f -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc -; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s5, 0x64c139ef +; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mul_hi_u32 v4, s6, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s7, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s7, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s7, v1 +; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-NEXT: s_mul_i32 s5, s7, s5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s5, v4 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mul_i32 s4, s6, s8 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; 
GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v4 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v0, s3 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s2 -; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v3, v0, s2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 -; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v4 -; GCN-NEXT: s_mov_b32 s2, 0x976a7376 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s2, v5 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 -; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] -; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 -; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] -; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 -; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v6, s7 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s2, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc +; GCN-NEXT: s_mul_i32 s4, s7, s8 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v2 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v0, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 2, v0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = udiv i64 %x, 1235195949943 store i64 %r, i64 addrspace(1)* %out @@ -5149,113 +5055,40 @@ ; ; GCN-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 -; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s6, 0xf001 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_movk_i32 s0, 0xfff -; GCN-NEXT: v_mul_hi_u32 v3, v0, s6 -; GCN-NEXT: v_mul_lo_u32 v5, v1, s6 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s6 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_hi_u32 v6, 
v0, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v8, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc -; GCN-NEXT: v_mul_hi_u32 v5, v0, s6 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v1, v4, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v6, v3, s6 -; GCN-NEXT: v_mul_lo_u32 v8, v0, s6 -; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v8 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v11, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v7, v10, vcc -; GCN-NEXT: v_mul_lo_u32 v10, v3, v8 -; GCN-NEXT: v_mul_hi_u32 v8, v3, v8 -; GCN-NEXT: v_mul_lo_u32 v3, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s9, 0x10010011 +; GCN-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NEXT: s_mov_b32 s10, 0x100100 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v3, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v6, s11, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v2, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, s0 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s0 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s0, v4 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc -; GCN-NEXT: s_movk_i32 s0, 0xffe -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v3, v8, 
v6, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s3, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s2, v0 +; GCN-NEXT: s_mul_i32 s9, s3, s9 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s9, v3 +; GCN-NEXT: s_mul_i32 s8, s2, s10 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, s8, v3 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_addc_u32_e64 v2, s[8:9], 0, 0, vcc +; GCN-NEXT: s_mul_i32 s8, s3, s10 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s8, v1 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v0, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v3 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 +; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; GCN-NEXT: v_lshr_b64 v[2:3], v[0:1], 11 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %r = udiv <2 x i64> %x, @@ -5309,126 +5142,41 @@ ; ; GCN-LABEL: urem_i64_oddk_denom: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 -; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GCN-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s2, 0xfee0 -; GCN-NEXT: s_mov_b32 s3, 0x689e0837 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 -; GCN-NEXT: s_movk_i32 s12, 0x11f -; GCN-NEXT: s_mov_b32 s13, 0x9761f7c9 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v3, v0, s3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: s_mov_b32 s10, 0xe3e10011 +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: s_movk_i32 s5, 0x11e -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 -; GCN-NEXT: v_mul_hi_u32 v5, v0, s3 -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v6, v2, s3 -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 -; GCN-NEXT: s_mov_b32 s4, 0x9761f7c8 -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc -; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: s_mov_b32 s10, -1 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s6, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s7, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s4, 0xf6841139 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s7, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: s_mul_i32 s4, s7, s4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; GCN-NEXT: s_mul_i32 s1, s6, s10 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, s1, v3 ; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v0, s12 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 -; GCN-NEXT: v_mul_lo_u32 v1, v1, s13 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s13 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 -; GCN-NEXT: v_mov_b32_e32 v3, s12 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc -; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s13, v0 -; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v4 -; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s13, v4 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, v5 -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] -; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v5, s7 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s5, v1 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s12, v1 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, 
v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: s_mul_i32 s1, s7, s10 +; GCN-NEXT: v_addc_u32_e64 v2, s[8:9], 0, 0, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, s1, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v0, v2, vcc +; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GCN-NEXT: s_mov_b32 s4, 0x9761f7c9 +; GCN-NEXT: v_mul_hi_u32 v1, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v2, v0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_movk_i32 s5, 0x11f +; GCN-NEXT: v_mad_u32_u24 v1, v0, s5, v1 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v2 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = urem i64 %x, 1235195393993 store i64 %r, i64 addrspace(1)* %out @@ -5575,122 +5323,44 @@ ; ; GCN-LABEL: sdiv_i64_oddk_denom: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 -; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s2, 0xffed2705 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v2, v1, s2 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s9, 0xfd81e19 +; GCN-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NEXT: s_mov_b32 s8, 0x6ca94220 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc -; GCN-NEXT: s_mov_b32 s5, s9 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v4, v2, s2 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: s_ashr_i32 s2, s11, 31 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] -; GCN-NEXT: 
s_add_u32 s0, s10, s2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_mov_b32 s3, s2 -; GCN-NEXT: s_addc_u32 s1, s11, s2 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s7, v2 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: s_mul_i32 s5, s7, s9 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s5, v3 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mul_i32 s4, s6, s8 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s1, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 -; GCN-NEXT: s_mov_b32 s3, 0x12d8fb -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, s3 -; GCN-NEXT: v_mul_hi_u32 v3, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s3, v4 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc -; GCN-NEXT: s_mov_b32 s0, 0x12d8fa -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GCN-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc +; GCN-NEXT: s_ashr_i32 s5, s7, 31 +; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 +; GCN-NEXT: v_mul_hi_u32 v2, s5, v2 +; GCN-NEXT: s_mul_i32 s4, s7, s8 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s4, v1 +; GCN-NEXT: s_mul_i32 s4, s5, s8 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; GCN-NEXT: s_mul_i32 s5, s5, s9 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s5, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s5, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GCN-NEXT: v_ashr_i64 v[2:3], v[0:1], 19 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = sdiv i64 %x, 1235195 store i64 %r, i64 addrspace(1)* %out @@ -5930,128 +5600,57 @@ ; ; GCN-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x457ff000 -; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GCN-NEXT: v_mac_f32_e32 v0, 0, v1 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s6, 0xf001 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s10, 0x8008009 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: s_mov_b32 s8, 0x80080080 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s0, s9, 31 -; GCN-NEXT: s_lshr_b32 s0, s0, 20 -; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 -; GCN-NEXT: v_mul_lo_u32 v3, v1, s6 -; GCN-NEXT: s_add_u32 s2, s8, s0 -; GCN-NEXT: s_addc_u32 s3, s9, 0 -; GCN-NEXT: s_ashr_i32 s8, s11, 31 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v0, s6 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 -; GCN-NEXT: s_mov_b32 s9, s8 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mov_b32_e32 v6, 0 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v5, v2, s6 -; GCN-NEXT: v_mul_hi_u32 v7, s6, v0 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GCN-NEXT: v_mul_lo_u32 v7, v0, s6 -; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 -; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc +; GCN-NEXT: v_mul_hi_u32 v4, s2, v2 +; GCN-NEXT: v_mul_hi_u32 v3, s3, v2 +; GCN-NEXT: s_ashr_i32 s9, s1, 31 +; GCN-NEXT: v_mul_hi_u32 v1, s2, v0 +; GCN-NEXT: s_mul_i32 s11, s3, s10 +; GCN-NEXT: s_lshr_b32 s9, s9, 20 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s11, v4 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_mul_i32 s9, s2, s8 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s9, v4 +; GCN-NEXT: s_ashr_i32 s11, s3, 31 +; GCN-NEXT: v_addc_u32_e32 v1, 
vcc, v1, v5, vcc +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-NEXT: s_mul_i32 s9, s3, s8 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v5, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, s9, v1 +; GCN-NEXT: s_mul_i32 s8, s11, s8 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, s8, v2 +; GCN-NEXT: s_mul_i32 s8, s11, s10 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s8, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s8 +; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3 +; GCN-NEXT: v_subb_u32_e32 v2, vcc, v2, v4, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] -; GCN-NEXT: s_add_u32 s0, s10, s8 -; GCN-NEXT: s_addc_u32 s1, s11, s8 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v7, s1, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 -; GCN-NEXT: s_movk_i32 s9, 0xfff +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s2, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc +; GCN-NEXT: v_ashr_i64 v[2:3], v[0:1], 11 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v0 +; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, s9 -; GCN-NEXT: v_mul_hi_u32 v3, s9, v0 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s9 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v4 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc -; GCN-NEXT: s_movk_i32 s0, 0xffe -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s8, v1 -; GCN-NEXT: v_mov_b32_e32 v3, s8 -; GCN-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; 
GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %r = sdiv <2 x i64> %x, @@ -6356,120 +5955,51 @@ ; ; GCN-LABEL: srem_i64_oddk_denom: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 -; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s2, 0xffed2705 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v2, v1, s2 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s9, 0xfd81e19 +; GCN-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NEXT: s_mov_b32 s8, 0x6ca94220 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc -; GCN-NEXT: s_mov_b32 s5, s9 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v4, v2, s2 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: s_ashr_i32 s2, s11, 31 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] -; GCN-NEXT: s_add_u32 s0, s10, s2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_mov_b32 s3, s2 -; GCN-NEXT: s_addc_u32 s1, s11, s2 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s7, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: s_mul_i32 s1, s7, s9 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s1, v3 +; GCN-NEXT: s_mul_i32 s0, s6, s8 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v3 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 -; 
GCN-NEXT: v_mul_hi_u32 v5, s1, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 -; GCN-NEXT: s_mov_b32 s3, 0x12d8fb -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_hi_u32 v2, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GCN-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc +; GCN-NEXT: s_ashr_i32 s1, s7, 31 +; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 +; GCN-NEXT: v_mul_hi_u32 v2, s1, v2 +; GCN-NEXT: s_mul_i32 s0, s7, s8 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v1 +; GCN-NEXT: s_mul_i32 s0, s1, s8 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GCN-NEXT: s_mul_i32 s1, s1, s9 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s1, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GCN-NEXT: v_ashr_i64 v[2:3], v[0:1], 19 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: s_mov_b32 s8, 0x12d8fb +; GCN-NEXT: v_mul_hi_u32 v2, v0, s8 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: v_mad_i32_i24 v1, v1, s8, v2 +; GCN-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s3, v0 -; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s3, v2 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc -; GCN-NEXT: s_mov_b32 s0, 0x12d8fa -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = srem i64 %x, 1235195 store i64 %r, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1424,110 +1424,33 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x) { ; GCN-LABEL: s_test_udiv_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 -; GCN-NEXT: 
v_madak_f32 v0, 0, v0, 0x41c00000 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s2, 0xffe8 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v3, v1, s2 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s10, 0xaaaaaaaa +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v4, v0, s2 -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v5, v2, s2 -; GCN-NEXT: v_mul_lo_u32 v6, v0, s2 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GCN-NEXT: s_mov_b32 s5, s9 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc -; GCN-NEXT: v_mul_lo_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s4, 0xaaaaaaab +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s7, v2 +; GCN-NEXT: s_mul_i32 s4, s7, s4 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; GCN-NEXT: s_mul_i32 s1, s6, s10 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, s1, v3 +; GCN-NEXT: v_mul_hi_u32 v3, s7, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_addc_u32_e32 v0, 
vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 -; GCN-NEXT: v_mul_hi_u32 v3, v0, 24 -; GCN-NEXT: v_mul_lo_u32 v4, v0, 24 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, 24, v4 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], 23, v4 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: s_mul_i32 s1, s7, s10 +; GCN-NEXT: v_addc_u32_e64 v2, s[8:9], 0, 0, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, s1, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], 4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv_k_den_i64: @@ -1618,102 +1541,25 @@ ; GCN-LABEL: v_test_udiv_k_den_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GCN-NEXT: v_madak_f32 v2, 0, v2, 0x41c00000 -; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: s_movk_i32 s6, 0xffe8 -; GCN-NEXT: v_mov_b32_e32 v10, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_hi_u32 v4, v2, s6 -; GCN-NEXT: v_mul_lo_u32 v5, v3, s6 -; GCN-NEXT: v_mul_lo_u32 v6, v2, s6 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v8, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v11, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_mul_lo_u32 v8, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v6, v3, v6 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v6, vcc -; GCN-NEXT: v_mul_hi_u32 v6, v2, s6 -; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v7, v4, s6 -; GCN-NEXT: v_mul_lo_u32 v8, v2, s6 -; GCN-NEXT: v_subrev_i32_e32 v6, vcc, v2, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v6 
-; GCN-NEXT: v_mul_hi_u32 v11, v2, v8 -; GCN-NEXT: v_mul_hi_u32 v12, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v13, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v10, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v12, v4, v8 -; GCN-NEXT: v_mul_hi_u32 v8, v4, v8 -; GCN-NEXT: v_mul_lo_u32 v4, v4, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v13, v9, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v6, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[4:5] -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v6, vcc -; GCN-NEXT: v_mul_lo_u32 v6, v1, v2 -; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v9, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v10, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v3, 24 -; GCN-NEXT: v_mul_hi_u32 v5, v2, 24 -; GCN-NEXT: v_mul_lo_u32 v6, v2, 24 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, 24, v0 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v1, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v4 -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v2 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v2 -; GCN-NEXT: v_cmp_lt_u32_e64 s[4:5], 23, v0 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; GCN-NEXT: s_mov_b32 s4, 0xaaaaaaab +; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 +; GCN-NEXT: v_mul_hi_u32 v4, v0, s4 +; GCN-NEXT: s_mov_b32 s6, 0xaaaaaaaa +; GCN-NEXT: v_mul_hi_u32 v5, v1, s4 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s6 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, s6 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GCN-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 4 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 4, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IR-LABEL: v_test_udiv_k_den_i64: diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -958,108 +958,41 @@ define 
amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x) { ; GCN-LABEL: s_test_urem_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 -; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x41c00000 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s2, 0xffe8 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v3, v1, s2 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 -; GCN-NEXT: s_mov_b32 s10, -1 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc +; GCN-NEXT: s_mov_b32 s10, 0xaaaaaaaa +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v4, v0, s2 -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v5, v2, s2 -; GCN-NEXT: v_mul_lo_u32 v6, v0, s2 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GCN-NEXT: s_mov_b32 s9, s5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc -; GCN-NEXT: v_mul_lo_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s6, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s7, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s4, 0xaaaaaaab +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s7, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: s_mul_i32 s4, s7, s4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; GCN-NEXT: s_mul_i32 s1, s6, s10 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, s1, v3 ; GCN-NEXT: 
v_mul_hi_u32 v0, s7, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v0, 24 -; GCN-NEXT: v_mul_lo_u32 v1, v1, 24 -; GCN-NEXT: v_mul_lo_u32 v0, v0, 24 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, 24, v0 -; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 -; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v2 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], 23, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: s_mul_i32 s1, s7, s10 +; GCN-NEXT: v_addc_u32_e64 v2, s[8:9], 0, 0, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, s1, v1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v0, v2, vcc +; GCN-NEXT: v_alignbit_b32 v1, v0, v1, 4 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 4, v0 +; GCN-NEXT: v_mul_hi_u32 v2, v1, 24 +; GCN-NEXT: v_mul_lo_u32 v0, v0, 24 +; GCN-NEXT: v_mul_lo_u32 v1, v1, 24 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v1 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem_k_den_i64: diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-alu.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-alu.ll --- a/llvm/test/CodeGen/BPF/32-bit-subreg-alu.ll +++ b/llvm/test/CodeGen/BPF/32-bit-subreg-alu.ll @@ -190,7 +190,10 @@ define dso_local i32 @div_i(i32 %a) local_unnamed_addr #0 { entry: %div = udiv i32 %a, 15 -; CHECK: w{{[0-9]+}} /= 15 +; CHECK: [[REG1:r[0-9]+]] = w{{[0-9]+}} +; CHECK: [[REG2:r[0-9]+]] = 2290649225 ll +; CHECK: [[REG1]] *= [[REG2]] +; CHECK: [[REG1]] >>= 35 ret i32 %div } diff --git a/llvm/test/CodeGen/BPF/sdiv_error.ll b/llvm/test/CodeGen/BPF/sdiv_error.ll --- a/llvm/test/CodeGen/BPF/sdiv_error.ll +++ b/llvm/test/CodeGen/BPF/sdiv_error.ll @@ -3,7 +3,7 @@ ; CHECK: Unsupport signed division ; Function Attrs: norecurse nounwind readnone -define i32 @test(i32 %len) #0 { - %1 = srem i32 %len, 15 +define i32 @test(i32 %len, i32 %rhs) #0 { + %1 = srem i32 %len, %rhs ret i32 %1 } diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll --- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll @@ -617,7 +617,6 @@ ; CHECK-LABEL: test_ds_cross_basic_blocks: ; CHECK: # %bb.0: ; CHECK-NEXT: cmplwi r4, 0 -; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r28, 
-32(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill @@ -629,59 +628,57 @@ ; CHECK-NEXT: addi r6, r3, 4009 ; CHECK-NEXT: addis r5, r2, .LC0@toc@ha ; CHECK-NEXT: ld r5, .LC0@toc@l(r5) -; CHECK-NEXT: iselgt r8, r4, r7 -; CHECK-NEXT: lis r4, -21846 +; CHECK-NEXT: iselgt r4, r4, r7 ; CHECK-NEXT: li r3, 0 -; CHECK-NEXT: li r9, -7 -; CHECK-NEXT: li r10, -6 +; CHECK-NEXT: li r8, -7 +; CHECK-NEXT: li r9, -6 +; CHECK-NEXT: li r10, 1 ; CHECK-NEXT: li r11, 1 ; CHECK-NEXT: li r12, 1 ; CHECK-NEXT: li r30, 1 ; CHECK-NEXT: ld r5, 0(r5) -; CHECK-NEXT: mtctr r8 -; CHECK-NEXT: li r8, -9 +; CHECK-NEXT: mtctr r4 +; CHECK-NEXT: li r4, -9 ; CHECK-NEXT: addi r5, r5, -1 -; CHECK-NEXT: ori r4, r4, 43691 ; CHECK-NEXT: li r29, 1 -; CHECK-NEXT: li r28, 1 ; CHECK-NEXT: b .LBB6_4 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB6_2: -; CHECK-NEXT: ldx r0, r6, r8 -; CHECK-NEXT: add r28, r0, r28 -; CHECK-NEXT: ld r0, -8(r6) +; CHECK-NEXT: ldx r0, r6, r4 ; CHECK-NEXT: add r29, r0, r29 +; CHECK-NEXT: ld r0, -8(r6) +; CHECK-NEXT: add r30, r0, r30 ; CHECK-NEXT: .LBB6_3: ; CHECK-NEXT: addi r6, r6, 1 -; CHECK-NEXT: mulld r0, r29, r28 -; CHECK-NEXT: mulld r0, r0, r30 +; CHECK-NEXT: mulld r0, r30, r29 ; CHECK-NEXT: mulld r0, r0, r12 ; CHECK-NEXT: mulld r0, r0, r11 +; CHECK-NEXT: mulld r0, r0, r10 ; CHECK-NEXT: maddld r3, r0, r7, r3 ; CHECK-NEXT: bdz .LBB6_9 ; CHECK-NEXT: .LBB6_4: ; CHECK-NEXT: lbzu r0, 1(r5) -; CHECK-NEXT: mulhwu r27, r0, r4 -; CHECK-NEXT: rlwinm r26, r27, 0, 0, 30 -; CHECK-NEXT: srwi r27, r27, 1 -; CHECK-NEXT: add r27, r27, r26 -; CHECK-NEXT: sub r0, r0, r27 +; CHECK-NEXT: mulli r28, r0, 171 +; CHECK-NEXT: rlwinm r27, r28, 24, 8, 30 +; CHECK-NEXT: srwi r28, r28, 9 +; CHECK-NEXT: add r28, r28, r27 +; CHECK-NEXT: sub r0, r0, r28 +; CHECK-NEXT: clrlwi r0, r0, 24 ; CHECK-NEXT: cmplwi r0, 1 ; CHECK-NEXT: beq cr0, .LBB6_2 ; CHECK-NEXT: # %bb.5: -; CHECK-NEXT: clrlwi r0, r0, 24 ; CHECK-NEXT: cmplwi r0, 2 ; CHECK-NEXT: bne cr0, .LBB6_7 ; CHECK-NEXT: # %bb.6: -; CHECK-NEXT: ldx r0, r6, r9 -; CHECK-NEXT: add r30, r0, r30 -; CHECK-NEXT: ld r0, -4(r6) +; CHECK-NEXT: ldx r0, r6, r8 ; CHECK-NEXT: add r12, r0, r12 +; CHECK-NEXT: ld r0, -4(r6) +; CHECK-NEXT: add r11, r0, r11 ; CHECK-NEXT: b .LBB6_3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB6_7: -; CHECK-NEXT: ldx r0, r6, r10 -; CHECK-NEXT: add r11, r0, r11 +; CHECK-NEXT: ldx r0, r6, r9 +; CHECK-NEXT: add r10, r0, r10 ; CHECK-NEXT: ld r0, 0(r6) ; CHECK-NEXT: add r7, r0, r7 ; CHECK-NEXT: b .LBB6_3 @@ -692,7 +689,6 @@ ; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload ; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload ; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload ; CHECK-NEXT: blr %3 = sext i32 %1 to i64 %4 = icmp eq i32 %1, 0 diff --git a/llvm/test/CodeGen/PowerPC/srem-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-lkk.ll --- a/llvm/test/CodeGen/PowerPC/srem-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/srem-lkk.ll @@ -130,20 +130,93 @@ ; Don't fold i64 srem define i64 @dont_fold_srem_i64(i64 %x) { -; CHECK-LABEL: dont_fold_srem_i64: -; CHECK: # %bb.0: -; CHECK-NEXT: mflr 0 -; CHECK-NEXT: stw 0, 4(1) -; CHECK-NEXT: stwu 1, -16(1) -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset lr, 4 -; CHECK-NEXT: li 5, 0 -; CHECK-NEXT: li 6, 98 -; CHECK-NEXT: bl __moddi3 -; CHECK-NEXT: lwz 0, 20(1) -; CHECK-NEXT: addi 1, 1, 16 -; CHECK-NEXT: mtlr 0 -; CHECK-NEXT: blr +; CHECK64-LABEL: dont_fold_srem_i64: +; CHECK64: # %bb.0: +; CHECK64-NEXT: mflr 0 +; CHECK64-NEXT: stw 0, 4(1) +; CHECK64-NEXT: stwu 1, 
-32(1) +; CHECK64-NEXT: .cfi_def_cfa_offset 32 +; CHECK64-NEXT: .cfi_offset lr, 4 +; CHECK64-NEXT: .cfi_offset r29, -12 +; CHECK64-NEXT: .cfi_offset r30, -8 +; CHECK64-NEXT: stw 29, 20(1) # 4-byte Folded Spill +; CHECK64-NEXT: mr 29, 3 +; CHECK64-NEXT: srawi 3, 3, 31 +; CHECK64-NEXT: lis 5, -17388 +; CHECK64-NEXT: li 7, 0 +; CHECK64-NEXT: stw 30, 24(1) # 4-byte Folded Spill +; CHECK64-NEXT: mr 30, 4 +; CHECK64-NEXT: lis 4, 21399 +; CHECK64-NEXT: ori 9, 4, 33436 +; CHECK64-NEXT: ori 10, 5, 58849 +; CHECK64-NEXT: mr 4, 3 +; CHECK64-NEXT: mr 5, 29 +; CHECK64-NEXT: mr 6, 30 +; CHECK64-NEXT: li 8, 0 +; CHECK64-NEXT: bl __multi3 +; CHECK64-NEXT: rotlwi 4, 4, 27 +; CHECK64-NEXT: srwi 6, 3, 31 +; CHECK64-NEXT: rlwimi 4, 3, 27, 0, 4 +; CHECK64-NEXT: srawi 3, 3, 5 +; CHECK64-NEXT: addc 4, 4, 6 +; CHECK64-NEXT: li 5, 98 +; CHECK64-NEXT: addze 3, 3 +; CHECK64-NEXT: mulhwu 5, 4, 5 +; CHECK64-NEXT: mulli 4, 4, 98 +; CHECK64-NEXT: mulli 3, 3, 98 +; CHECK64-NEXT: add 3, 5, 3 +; CHECK64-NEXT: subc 4, 30, 4 +; CHECK64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload +; CHECK64-NEXT: subfe 3, 3, 29 +; CHECK64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload +; CHECK64-NEXT: lwz 0, 36(1) +; CHECK64-NEXT: addi 1, 1, 32 +; CHECK64-NEXT: mtlr 0 +; CHECK64-NEXT: blr +; +; CHECK32-LABEL: dont_fold_srem_i64: +; CHECK32: # %bb.0: +; CHECK32-NEXT: mflr 0 +; CHECK32-NEXT: stw 0, 4(1) +; CHECK32-NEXT: stwu 1, -32(1) +; CHECK32-NEXT: .cfi_def_cfa_offset 32 +; CHECK32-NEXT: .cfi_offset lr, 4 +; CHECK32-NEXT: .cfi_offset r29, -12 +; CHECK32-NEXT: .cfi_offset r30, -8 +; CHECK32-NEXT: stw 29, 20(1) # 4-byte Folded Spill +; CHECK32-NEXT: mr 29, 3 +; CHECK32-NEXT: stw 30, 24(1) # 4-byte Folded Spill +; CHECK32-NEXT: mr 30, 4 +; CHECK32-NEXT: srawi 3, 3, 31 +; CHECK32-NEXT: lis 4, 21399 +; CHECK32-NEXT: lis 5, -17388 +; CHECK32-NEXT: ori 9, 4, 33436 +; CHECK32-NEXT: ori 10, 5, 58849 +; CHECK32-NEXT: mr 5, 29 +; CHECK32-NEXT: mr 6, 30 +; CHECK32-NEXT: mr 4, 3 +; CHECK32-NEXT: li 7, 0 +; CHECK32-NEXT: li 8, 0 +; CHECK32-NEXT: bl __multi3 +; CHECK32-NEXT: rotlwi 4, 4, 27 +; CHECK32-NEXT: srwi 6, 3, 31 +; CHECK32-NEXT: rlwimi 4, 3, 27, 0, 4 +; CHECK32-NEXT: srawi 3, 3, 5 +; CHECK32-NEXT: addc 4, 4, 6 +; CHECK32-NEXT: li 5, 98 +; CHECK32-NEXT: addze 3, 3 +; CHECK32-NEXT: mulhwu 5, 4, 5 +; CHECK32-NEXT: mulli 4, 4, 98 +; CHECK32-NEXT: subc 4, 30, 4 +; CHECK32-NEXT: lwz 30, 24(1) # 4-byte Folded Reload +; CHECK32-NEXT: mulli 3, 3, 98 +; CHECK32-NEXT: add 3, 5, 3 +; CHECK32-NEXT: subfe 3, 3, 29 +; CHECK32-NEXT: lwz 29, 20(1) # 4-byte Folded Reload +; CHECK32-NEXT: lwz 0, 36(1) +; CHECK32-NEXT: addi 1, 1, 32 +; CHECK32-NEXT: mtlr 0 +; CHECK32-NEXT: blr %1 = srem i64 %x, 98 ret i64 %1 } diff --git a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll --- a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll @@ -11,233 +11,128 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { ; P9LE-LABEL: fold_srem_vec_1: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: lis r4, -21386 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 37253 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: add r4, r4, r3 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, 31710 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: ori r4, r4, 63421 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, 
r3, r4 -; P9LE-NEXT: sub r4, r4, r3 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, -124 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, 21399 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: ori r4, r4, 33437 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 5 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 98 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, -16728 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: ori r4, r4, 63249 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 8 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, -1003 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P9LE-NEXT: vmrglh v4, v2, v2 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P9LE-NEXT: vextsh2w v4, v4 +; P9LE-NEXT: lxvx v3, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P9LE-NEXT: vmuluwm v3, v4, v3 +; P9LE-NEXT: vspltisw v4, 8 +; P9LE-NEXT: vadduwm v4, v4, v4 +; P9LE-NEXT: vsrw v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P9LE-NEXT: vmladduhm v3, v2, v4, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P9LE-NEXT: vsrah v3, v3, v4 +; P9LE-NEXT: vspltish v4, 15 +; P9LE-NEXT: vsrh v4, v3, v4 +; P9LE-NEXT: vadduhm v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: fold_srem_vec_1: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: lis r4, 31710 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 63421 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: sub r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, -124 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, -21386 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 37253 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, -16728 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 63249 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 8 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, -1003 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, 21399 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 33437 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 5 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: 
mulli r4, r4, 98 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v3, v2 +; P9BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P9BE-NEXT: vmrghh v4, v2, v2 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P9BE-NEXT: vextsh2w v4, v4 +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P9BE-NEXT: vmuluwm v3, v4, v3 +; P9BE-NEXT: vspltisw v4, 8 +; P9BE-NEXT: vadduwm v4, v4, v4 +; P9BE-NEXT: vsrw v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P9BE-NEXT: vmladduhm v3, v2, v4, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P9BE-NEXT: vsrah v3, v3, v4 +; P9BE-NEXT: vspltish v4, 15 +; P9BE-NEXT: vsrh v4, v3, v4 +; P9BE-NEXT: vadduhm v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: fold_srem_vec_1: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 21399 -; P8LE-NEXT: lis r8, -16728 -; P8LE-NEXT: lis r9, -21386 -; P8LE-NEXT: lis r10, 31710 -; P8LE-NEXT: ori r3, r3, 33437 -; P8LE-NEXT: ori r8, r8, 63249 -; P8LE-NEXT: ori r9, r9, 37253 -; P8LE-NEXT: ori r10, r10, 63421 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: rldicl r5, r4, 32, 48 -; P8LE-NEXT: rldicl r6, r4, 16, 48 -; P8LE-NEXT: clrldi r7, r4, 48 -; P8LE-NEXT: extsh r5, r5 -; P8LE-NEXT: extsh r6, r6 -; P8LE-NEXT: rldicl r4, r4, 48, 48 -; P8LE-NEXT: extsh r7, r7 -; P8LE-NEXT: mulhw r3, r5, r3 -; P8LE-NEXT: extsh r4, r4 -; P8LE-NEXT: mulhw r8, r6, r8 -; P8LE-NEXT: mulhw r9, r7, r9 -; P8LE-NEXT: mulhw r10, r4, r10 -; P8LE-NEXT: srwi r11, r3, 31 -; P8LE-NEXT: srawi r3, r3, 5 -; P8LE-NEXT: add r3, r3, r11 -; P8LE-NEXT: srwi r11, r8, 31 -; P8LE-NEXT: add r9, r9, r7 -; P8LE-NEXT: srawi r8, r8, 8 -; P8LE-NEXT: sub r10, r10, r4 -; P8LE-NEXT: add r8, r8, r11 -; P8LE-NEXT: srwi r11, r9, 31 -; P8LE-NEXT: srawi r9, r9, 6 -; P8LE-NEXT: mulli r3, r3, 98 -; P8LE-NEXT: add r9, r9, r11 -; P8LE-NEXT: srwi r11, r10, 31 -; P8LE-NEXT: srawi r10, r10, 6 -; P8LE-NEXT: mulli r8, r8, -1003 -; P8LE-NEXT: add r10, r10, r11 -; P8LE-NEXT: mulli r9, r9, 95 -; P8LE-NEXT: mulli r10, r10, -124 -; P8LE-NEXT: sub r3, r5, r3 -; P8LE-NEXT: mtvsrd v2, r3 -; P8LE-NEXT: sub r5, r6, r8 -; P8LE-NEXT: sub r3, r7, r9 -; P8LE-NEXT: mtvsrd v3, r5 -; P8LE-NEXT: sub r4, r4, r10 -; P8LE-NEXT: mtvsrd v4, r3 -; P8LE-NEXT: mtvsrd v5, r4 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: vmrghh v3, v5, v4 -; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: vmrglh v3, v2, v2 +; P8LE-NEXT: vspltisw v4, 8 +; P8LE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P8LE-NEXT: vslw v3, v3, v4 +; P8LE-NEXT: vsraw v3, v3, v4 +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: vxor v5, v5, v5 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vmladduhm v3, v2, v4, v3 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P8LE-NEXT: vsrah v3, v3, v4 +; P8LE-NEXT: vspltish v4, 15 +; 
P8LE-NEXT: vsrh v4, v3, v4 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: vmladduhm v3, v3, v4, v5 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: fold_srem_vec_1: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, -16728 -; P8BE-NEXT: lis r8, 21399 -; P8BE-NEXT: lis r9, 31710 -; P8BE-NEXT: lis r10, -21386 -; P8BE-NEXT: ori r3, r3, 63249 -; P8BE-NEXT: ori r8, r8, 33437 -; P8BE-NEXT: ori r9, r9, 63421 -; P8BE-NEXT: ori r10, r10, 37253 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: extsh r5, r5 -; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: rldicl r4, r4, 16, 48 -; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: mulhw r3, r5, r3 -; P8BE-NEXT: extsh r4, r4 -; P8BE-NEXT: mulhw r8, r6, r8 -; P8BE-NEXT: mulhw r9, r7, r9 -; P8BE-NEXT: mulhw r10, r4, r10 -; P8BE-NEXT: srwi r11, r3, 31 -; P8BE-NEXT: srawi r3, r3, 8 -; P8BE-NEXT: add r3, r3, r11 -; P8BE-NEXT: srwi r11, r8, 31 -; P8BE-NEXT: sub r9, r9, r7 -; P8BE-NEXT: srawi r8, r8, 5 -; P8BE-NEXT: add r10, r10, r4 -; P8BE-NEXT: add r8, r8, r11 -; P8BE-NEXT: srwi r11, r9, 31 -; P8BE-NEXT: srawi r9, r9, 6 -; P8BE-NEXT: mulli r3, r3, -1003 -; P8BE-NEXT: add r9, r9, r11 -; P8BE-NEXT: srwi r11, r10, 31 -; P8BE-NEXT: srawi r10, r10, 6 -; P8BE-NEXT: mulli r8, r8, 98 -; P8BE-NEXT: add r10, r10, r11 -; P8BE-NEXT: mulli r9, r9, -124 -; P8BE-NEXT: mulli r10, r10, 95 -; P8BE-NEXT: sub r3, r5, r3 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: sub r5, r6, r8 -; P8BE-NEXT: mtvsrd v2, r3 -; P8BE-NEXT: sub r6, r7, r9 -; P8BE-NEXT: sldi r3, r5, 48 -; P8BE-NEXT: sub r4, r4, r10 -; P8BE-NEXT: mtvsrd v3, r3 -; P8BE-NEXT: sldi r3, r6, 48 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: mtvsrd v5, r4 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: vmrghh v3, v5, v4 -; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v2, v2 +; P8BE-NEXT: vspltisw v4, 8 +; P8BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P8BE-NEXT: vslw v3, v3, v4 +; P8BE-NEXT: vsraw v3, v3, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: vxor v5, v5, v5 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vmladduhm v3, v2, v4, v3 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P8BE-NEXT: lxvw4x v0, 0, r3 +; P8BE-NEXT: vsrah v3, v3, v4 +; P8BE-NEXT: vspltish v4, 15 +; P8BE-NEXT: vsrh v4, v3, v4 +; P8BE-NEXT: vadduhm v3, v3, v4 +; P8BE-NEXT: vmladduhm v3, v3, v0, v5 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -246,217 +141,108 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { ; P9LE-LABEL: fold_srem_vec_2: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: lis r4, -21386 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 37253 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r5, r3, r4 -; P9LE-NEXT: add r5, r5, r3 -; P9LE-NEXT: srwi r6, r5, 31 -; P9LE-NEXT: srawi r5, r5, 6 -; P9LE-NEXT: add r5, r5, r6 -; P9LE-NEXT: mulli r5, r5, 95 -; P9LE-NEXT: sub r3, r3, r5 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r5, r3, r4 -; P9LE-NEXT: 
add r5, r5, r3 -; P9LE-NEXT: srwi r6, r5, 31 -; P9LE-NEXT: srawi r5, r5, 6 -; P9LE-NEXT: add r5, r5, r6 -; P9LE-NEXT: mulli r5, r5, 95 -; P9LE-NEXT: sub r3, r3, r5 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r5, r3, r4 -; P9LE-NEXT: add r5, r5, r3 -; P9LE-NEXT: srwi r6, r5, 31 -; P9LE-NEXT: srawi r5, r5, 6 -; P9LE-NEXT: add r5, r5, r6 -; P9LE-NEXT: mulli r5, r5, 95 -; P9LE-NEXT: sub r3, r3, r5 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: add r4, r4, r3 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P9LE-NEXT: vmrglh v4, v2, v2 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; P9LE-NEXT: vextsh2w v4, v4 +; P9LE-NEXT: lxvx v3, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P9LE-NEXT: vmuluwm v3, v4, v3 +; P9LE-NEXT: vspltisw v4, 8 +; P9LE-NEXT: vadduwm v4, v4, v4 +; P9LE-NEXT: vsrw v3, v3, v4 +; P9LE-NEXT: vspltish v4, 6 +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: vadduhm v3, v3, v2 +; P9LE-NEXT: vsrah v3, v3, v4 +; P9LE-NEXT: vspltish v4, 15 +; P9LE-NEXT: vsrh v4, v3, v4 +; P9LE-NEXT: vadduhm v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: fold_srem_vec_2: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r4, -21386 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 37253 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r5, r3, r4 -; P9BE-NEXT: add r5, r5, r3 -; P9BE-NEXT: srwi r6, r5, 31 -; P9BE-NEXT: srawi r5, r5, 6 -; P9BE-NEXT: add r5, r5, r6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: sub r3, r3, r5 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r5, r3, r4 -; P9BE-NEXT: add r5, r5, r3 -; P9BE-NEXT: srwi r6, r5, 31 -; P9BE-NEXT: srawi r5, r5, 6 -; P9BE-NEXT: add r5, r5, r6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: sub r3, r3, r5 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r5, r3, r4 -; P9BE-NEXT: add r5, r5, r3 -; P9BE-NEXT: srwi r6, r5, 31 -; P9BE-NEXT: srawi r5, r5, 6 -; P9BE-NEXT: add r5, r5, r6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: sub r3, r3, r5 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P9BE-NEXT: vmrghh v4, v2, v2 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; P9BE-NEXT: vextsh2w v4, v4 +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; 
P9BE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P9BE-NEXT: vmuluwm v3, v4, v3 +; P9BE-NEXT: vspltisw v4, 8 +; P9BE-NEXT: vadduwm v4, v4, v4 +; P9BE-NEXT: vsrw v3, v3, v4 +; P9BE-NEXT: vspltish v4, 6 +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: vadduhm v3, v3, v2 +; P9BE-NEXT: vsrah v3, v3, v4 +; P9BE-NEXT: vspltish v4, 15 +; P9BE-NEXT: vsrh v4, v3, v4 +; P9BE-NEXT: vadduhm v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: fold_srem_vec_2: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, -21386 -; P8LE-NEXT: ori r3, r3, 37253 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: clrldi r5, r4, 48 -; P8LE-NEXT: rldicl r6, r4, 48, 48 -; P8LE-NEXT: extsh r5, r5 -; P8LE-NEXT: rldicl r7, r4, 32, 48 -; P8LE-NEXT: extsh r6, r6 -; P8LE-NEXT: mulhw r8, r5, r3 -; P8LE-NEXT: rldicl r4, r4, 16, 48 -; P8LE-NEXT: extsh r7, r7 -; P8LE-NEXT: mulhw r9, r6, r3 -; P8LE-NEXT: extsh r4, r4 -; P8LE-NEXT: mulhw r10, r7, r3 -; P8LE-NEXT: mulhw r3, r4, r3 -; P8LE-NEXT: add r8, r8, r5 -; P8LE-NEXT: add r9, r9, r6 -; P8LE-NEXT: srwi r11, r8, 31 -; P8LE-NEXT: srawi r8, r8, 6 -; P8LE-NEXT: add r10, r10, r7 -; P8LE-NEXT: add r3, r3, r4 -; P8LE-NEXT: add r8, r8, r11 -; P8LE-NEXT: srwi r11, r9, 31 -; P8LE-NEXT: srawi r9, r9, 6 -; P8LE-NEXT: mulli r8, r8, 95 -; P8LE-NEXT: add r9, r9, r11 -; P8LE-NEXT: srwi r11, r10, 31 -; P8LE-NEXT: srawi r10, r10, 6 -; P8LE-NEXT: mulli r9, r9, 95 -; P8LE-NEXT: add r10, r10, r11 -; P8LE-NEXT: srwi r11, r3, 31 -; P8LE-NEXT: srawi r3, r3, 6 -; P8LE-NEXT: mulli r10, r10, 95 -; P8LE-NEXT: sub r5, r5, r8 -; P8LE-NEXT: add r3, r3, r11 -; P8LE-NEXT: mtvsrd v2, r5 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: sub r6, r6, r9 -; P8LE-NEXT: mtvsrd v3, r6 -; P8LE-NEXT: sub r5, r7, r10 -; P8LE-NEXT: mtvsrd v4, r5 -; P8LE-NEXT: sub r3, r4, r3 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: mtvsrd v5, r3 -; P8LE-NEXT: vmrghh v3, v5, v4 -; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v2, v2 +; P8LE-NEXT: vspltisw v4, 8 +; P8LE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P8LE-NEXT: vslw v3, v3, v4 +; P8LE-NEXT: vsraw v3, v3, v4 +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: vxor v5, v5, v5 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: vspltish v4, 6 +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vadduhm v3, v3, v2 +; P8LE-NEXT: vsrah v3, v3, v4 +; P8LE-NEXT: vspltish v4, 15 +; P8LE-NEXT: vsrh v4, v3, v4 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: vmladduhm v3, v3, v4, v5 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: fold_srem_vec_2: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, -21386 -; P8BE-NEXT: ori r3, r3, 37253 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: extsh r5, r5 -; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: mulhw r8, r5, r3 -; P8BE-NEXT: rldicl r4, r4, 16, 48 -; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: mulhw r9, r6, r3 -; P8BE-NEXT: extsh r4, r4 -; P8BE-NEXT: mulhw r10, r7, r3 -; P8BE-NEXT: mulhw r3, r4, r3 -; P8BE-NEXT: add r8, r8, r5 -; P8BE-NEXT: add r9, r9, r6 -; P8BE-NEXT: srwi r11, r8, 31 -; P8BE-NEXT: srawi r8, r8, 6 -; P8BE-NEXT: add r10, r10, r7 -; P8BE-NEXT: add r3, r3, r4 -; P8BE-NEXT: add r8, r8, r11 -; P8BE-NEXT: srwi r11, r9, 31 -; P8BE-NEXT: srawi r9, r9, 6 -; 
P8BE-NEXT: mulli r8, r8, 95 -; P8BE-NEXT: add r9, r9, r11 -; P8BE-NEXT: srwi r11, r10, 31 -; P8BE-NEXT: srawi r10, r10, 6 -; P8BE-NEXT: mulli r9, r9, 95 -; P8BE-NEXT: add r10, r10, r11 -; P8BE-NEXT: srwi r11, r3, 31 -; P8BE-NEXT: srawi r3, r3, 6 -; P8BE-NEXT: mulli r10, r10, 95 -; P8BE-NEXT: sub r5, r5, r8 -; P8BE-NEXT: add r3, r3, r11 -; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: sub r6, r6, r9 -; P8BE-NEXT: mtvsrd v2, r5 -; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: sub r7, r7, r10 -; P8BE-NEXT: mtvsrd v3, r6 -; P8BE-NEXT: sub r3, r4, r3 -; P8BE-NEXT: sldi r4, r7, 48 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v4, r4 -; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v3, v5, v4 -; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v2, v2 +; P8BE-NEXT: vspltisw v4, 8 +; P8BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P8BE-NEXT: lxvw4x v0, 0, r3 +; P8BE-NEXT: vslw v3, v3, v4 +; P8BE-NEXT: vsraw v3, v3, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: vxor v5, v5, v5 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: vspltish v4, 6 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vadduhm v3, v3, v2 +; P8BE-NEXT: vsrah v3, v3, v4 +; P8BE-NEXT: vspltish v4, 15 +; P8BE-NEXT: vsrh v4, v3, v4 +; P8BE-NEXT: vadduhm v3, v3, v4 +; P8BE-NEXT: vmladduhm v3, v3, v0, v5 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -467,257 +253,112 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; P9LE-LABEL: combine_srem_sdiv: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: lis r4, -21386 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 37253 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r5, r3, r4 -; P9LE-NEXT: add r5, r5, r3 -; P9LE-NEXT: srwi r6, r5, 31 -; P9LE-NEXT: srawi r5, r5, 6 -; P9LE-NEXT: add r5, r5, r6 -; P9LE-NEXT: mulli r6, r5, 95 -; P9LE-NEXT: sub r3, r3, r6 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r6, r3 -; P9LE-NEXT: mulhw r7, r6, r4 -; P9LE-NEXT: add r6, r7, r6 -; P9LE-NEXT: srwi r7, r6, 31 -; P9LE-NEXT: srawi r6, r6, 6 -; P9LE-NEXT: add r6, r6, r7 -; P9LE-NEXT: mulli r7, r6, 95 -; P9LE-NEXT: sub r3, r3, r7 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: extsh r7, r3 -; P9LE-NEXT: mulhw r8, r7, r4 -; P9LE-NEXT: add r7, r8, r7 -; P9LE-NEXT: srwi r8, r7, 31 -; P9LE-NEXT: srawi r7, r7, 6 -; P9LE-NEXT: add r7, r7, r8 -; P9LE-NEXT: mulli r8, r7, 95 -; P9LE-NEXT: sub r3, r3, r8 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r8, r3 -; P9LE-NEXT: mulhw r4, r8, r4 -; P9LE-NEXT: add r4, r4, r8 -; P9LE-NEXT: srwi r8, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r8 -; P9LE-NEXT: mulli r8, r4, 95 -; P9LE-NEXT: mtvsrd v5, r4 -; P9LE-NEXT: sub r3, r3, r8 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; P9LE-NEXT: mtvsrd v4, r6 -; P9LE-NEXT: vmrglw v2, v2, v3 -; P9LE-NEXT: mtvsrd v3, r5 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: mtvsrd v4, r7 -; P9LE-NEXT: vmrghh v4, v5, v4 -; P9LE-NEXT: vmrglw v3, v4, v3 +; P9LE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P9LE-NEXT: vmrglh v4, v2, v2 +; P9LE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P9LE-NEXT: vextsh2w v4, v4 +; P9LE-NEXT: 
lxvx v3, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P9LE-NEXT: lxvx v5, 0, r3 +; P9LE-NEXT: vmuluwm v3, v4, v3 +; P9LE-NEXT: vspltisw v4, 8 +; P9LE-NEXT: vadduwm v4, v4, v4 +; P9LE-NEXT: vsrw v3, v3, v4 +; P9LE-NEXT: vspltish v4, 6 +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: vadduhm v3, v3, v2 +; P9LE-NEXT: vsrah v3, v3, v4 +; P9LE-NEXT: vspltish v4, 15 +; P9LE-NEXT: vsrh v4, v3, v4 +; P9LE-NEXT: vadduhm v3, v3, v4 +; P9LE-NEXT: vxor v4, v4, v4 +; P9LE-NEXT: vmladduhm v4, v3, v5, v4 +; P9LE-NEXT: vsubuhm v2, v2, v4 ; P9LE-NEXT: vadduhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: combine_srem_sdiv: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r5, -21386 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r5, r5, 37253 -; P9BE-NEXT: extsh r4, r3 -; P9BE-NEXT: mulhw r6, r4, r5 -; P9BE-NEXT: add r4, r6, r4 -; P9BE-NEXT: srwi r6, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r6 -; P9BE-NEXT: mulli r6, r4, 95 -; P9BE-NEXT: sub r3, r3, r6 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r6, r3 -; P9BE-NEXT: mulhw r7, r6, r5 -; P9BE-NEXT: add r6, r7, r6 -; P9BE-NEXT: srwi r7, r6, 31 -; P9BE-NEXT: srawi r6, r6, 6 -; P9BE-NEXT: add r6, r6, r7 -; P9BE-NEXT: mulli r7, r6, 95 -; P9BE-NEXT: sub r3, r3, r7 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: extsh r7, r3 -; P9BE-NEXT: mulhw r8, r7, r5 -; P9BE-NEXT: add r7, r8, r7 -; P9BE-NEXT: srwi r8, r7, 31 -; P9BE-NEXT: srawi r7, r7, 6 -; P9BE-NEXT: add r7, r7, r8 -; P9BE-NEXT: mulli r8, r7, 95 -; P9BE-NEXT: sub r3, r3, r8 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r5, r3, r5 -; P9BE-NEXT: add r5, r5, r3 -; P9BE-NEXT: srwi r8, r5, 31 -; P9BE-NEXT: srawi r5, r5, 6 -; P9BE-NEXT: add r5, r5, r8 -; P9BE-NEXT: mulli r8, r5, 95 -; P9BE-NEXT: sub r3, r3, r8 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: sldi r3, r4, 48 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v2, v3 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: sldi r3, r6, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: sldi r3, r7, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: sldi r3, r5, 48 -; P9BE-NEXT: mtvsrd v5, r3 -; P9BE-NEXT: vmrghh v4, v5, v4 -; P9BE-NEXT: vmrghw v3, v4, v3 +; P9BE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P9BE-NEXT: vmrghh v4, v2, v2 +; P9BE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P9BE-NEXT: vextsh2w v4, v4 +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P9BE-NEXT: lxvx v5, 0, r3 +; P9BE-NEXT: vmuluwm v3, v4, v3 +; P9BE-NEXT: vspltisw v4, 8 +; P9BE-NEXT: vadduwm v4, v4, v4 +; P9BE-NEXT: vsrw v3, v3, v4 +; P9BE-NEXT: vspltish v4, 6 +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: vadduhm v3, v3, v2 +; P9BE-NEXT: vsrah v3, v3, v4 +; P9BE-NEXT: vspltish v4, 15 +; P9BE-NEXT: vsrh v4, v3, v4 +; P9BE-NEXT: vadduhm v3, v3, v4 +; P9BE-NEXT: vxor v4, v4, v4 +; P9BE-NEXT: vmladduhm v4, v3, v5, v4 +; P9BE-NEXT: vsubuhm v2, v2, v4 ; P9BE-NEXT: vadduhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: combine_srem_sdiv: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, -21386 -; P8LE-NEXT: ori r3, r3, 37253 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: clrldi r5, r4, 
48 -; P8LE-NEXT: rldicl r6, r4, 48, 48 -; P8LE-NEXT: rldicl r7, r4, 32, 48 -; P8LE-NEXT: extsh r5, r5 -; P8LE-NEXT: extsh r8, r6 -; P8LE-NEXT: extsh r9, r7 -; P8LE-NEXT: mulhw r10, r5, r3 -; P8LE-NEXT: mulhw r11, r8, r3 -; P8LE-NEXT: rldicl r4, r4, 16, 48 -; P8LE-NEXT: mulhw r12, r9, r3 -; P8LE-NEXT: extsh r0, r4 -; P8LE-NEXT: mulhw r3, r0, r3 -; P8LE-NEXT: add r10, r10, r5 -; P8LE-NEXT: add r8, r11, r8 -; P8LE-NEXT: srwi r11, r10, 31 -; P8LE-NEXT: add r9, r12, r9 -; P8LE-NEXT: srawi r10, r10, 6 -; P8LE-NEXT: srawi r12, r8, 6 -; P8LE-NEXT: srwi r8, r8, 31 -; P8LE-NEXT: add r10, r10, r11 -; P8LE-NEXT: add r3, r3, r0 -; P8LE-NEXT: srawi r11, r9, 6 -; P8LE-NEXT: srwi r9, r9, 31 -; P8LE-NEXT: add r8, r12, r8 -; P8LE-NEXT: mtvsrd v2, r10 -; P8LE-NEXT: mulli r12, r10, 95 -; P8LE-NEXT: add r9, r11, r9 -; P8LE-NEXT: srwi r11, r3, 31 -; P8LE-NEXT: mtvsrd v3, r8 -; P8LE-NEXT: srawi r3, r3, 6 -; P8LE-NEXT: mulli r10, r8, 95 -; P8LE-NEXT: mtvsrd v4, r9 -; P8LE-NEXT: add r3, r3, r11 -; P8LE-NEXT: mulli r8, r9, 95 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: mulli r9, r3, 95 -; P8LE-NEXT: sub r5, r5, r12 -; P8LE-NEXT: sub r6, r6, r10 -; P8LE-NEXT: mtvsrd v3, r5 -; P8LE-NEXT: mtvsrd v5, r6 -; P8LE-NEXT: sub r5, r7, r8 -; P8LE-NEXT: sub r4, r4, r9 -; P8LE-NEXT: mtvsrd v0, r5 -; P8LE-NEXT: mtvsrd v1, r4 -; P8LE-NEXT: vmrghh v3, v5, v3 -; P8LE-NEXT: mtvsrd v5, r3 -; P8LE-NEXT: vmrghh v0, v1, v0 -; P8LE-NEXT: vmrghh v4, v5, v4 -; P8LE-NEXT: vmrglw v3, v0, v3 -; P8LE-NEXT: vmrglw v2, v4, v2 -; P8LE-NEXT: vadduhm v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v2, v2 +; P8LE-NEXT: vspltisw v4, 8 +; P8LE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P8LE-NEXT: lvx v0, 0, r3 +; P8LE-NEXT: vslw v3, v3, v4 +; P8LE-NEXT: vsraw v3, v3, v4 +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: vxor v5, v5, v5 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: vspltish v4, 6 +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vadduhm v3, v3, v2 +; P8LE-NEXT: vsrah v3, v3, v4 +; P8LE-NEXT: vspltish v4, 15 +; P8LE-NEXT: vsrh v4, v3, v4 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: vmladduhm v4, v3, v0, v5 +; P8LE-NEXT: vsubuhm v2, v2, v4 +; P8LE-NEXT: vadduhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: combine_srem_sdiv: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r5, v2 -; P8BE-NEXT: lis r4, -21386 -; P8BE-NEXT: ori r4, r4, 37253 -; P8BE-NEXT: clrldi r3, r5, 48 -; P8BE-NEXT: rldicl r6, r5, 48, 48 -; P8BE-NEXT: extsh r8, r3 -; P8BE-NEXT: rldicl r7, r5, 32, 48 -; P8BE-NEXT: extsh r9, r6 -; P8BE-NEXT: rldicl r5, r5, 16, 48 -; P8BE-NEXT: mulhw r11, r8, r4 -; P8BE-NEXT: extsh r10, r7 -; P8BE-NEXT: extsh r5, r5 -; P8BE-NEXT: mulhw r12, r9, r4 -; P8BE-NEXT: mulhw r0, r10, r4 -; P8BE-NEXT: mulhw r4, r5, r4 -; P8BE-NEXT: add r8, r11, r8 -; P8BE-NEXT: add r9, r12, r9 -; P8BE-NEXT: srawi r11, r8, 6 -; P8BE-NEXT: srwi r8, r8, 31 -; P8BE-NEXT: add r10, r0, r10 -; P8BE-NEXT: add r4, r4, r5 -; P8BE-NEXT: add r8, r11, r8 -; P8BE-NEXT: srawi r12, r9, 6 -; P8BE-NEXT: srwi r9, r9, 31 -; P8BE-NEXT: srawi r0, r10, 6 -; P8BE-NEXT: srawi r11, r4, 6 -; P8BE-NEXT: srwi r10, r10, 31 -; P8BE-NEXT: add r9, r12, r9 -; P8BE-NEXT: srwi r4, r4, 31 -; P8BE-NEXT: mulli r12, r8, 95 -; P8BE-NEXT: add r10, r0, r10 -; P8BE-NEXT: add r4, r11, r4 -; P8BE-NEXT: mulli r0, r9, 95 -; P8BE-NEXT: sldi r9, r9, 48 -; P8BE-NEXT: sldi r8, r8, 48 -; P8BE-NEXT: mtvsrd v3, r9 -; P8BE-NEXT: mulli r9, r4, 95 -; P8BE-NEXT: mtvsrd 
v2, r8 -; P8BE-NEXT: mulli r8, r10, 95 -; P8BE-NEXT: sldi r10, r10, 48 -; P8BE-NEXT: sub r3, r3, r12 -; P8BE-NEXT: mtvsrd v4, r10 -; P8BE-NEXT: sub r6, r6, r0 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: mtvsrd v3, r3 -; P8BE-NEXT: sub r3, r5, r9 -; P8BE-NEXT: sub r7, r7, r8 -; P8BE-NEXT: mtvsrd v5, r6 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: sldi r5, r7, 48 -; P8BE-NEXT: mtvsrd v1, r3 -; P8BE-NEXT: sldi r3, r4, 48 -; P8BE-NEXT: mtvsrd v0, r5 -; P8BE-NEXT: vmrghh v3, v5, v3 -; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v0, v1, v0 -; P8BE-NEXT: vmrghh v4, v5, v4 -; P8BE-NEXT: vmrghw v3, v0, v3 -; P8BE-NEXT: vmrghw v2, v4, v2 -; P8BE-NEXT: vadduhm v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v2, v2 +; P8BE-NEXT: vspltisw v4, 8 +; P8BE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P8BE-NEXT: lxvw4x v0, 0, r3 +; P8BE-NEXT: vslw v3, v3, v4 +; P8BE-NEXT: vsraw v3, v3, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: vxor v5, v5, v5 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: vspltish v4, 6 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vadduhm v3, v3, v2 +; P8BE-NEXT: vsrah v3, v3, v4 +; P8BE-NEXT: vspltish v4, 15 +; P8BE-NEXT: vsrh v4, v3, v4 +; P8BE-NEXT: vadduhm v3, v3, v4 +; P8BE-NEXT: vmladduhm v4, v3, v0, v5 +; P8BE-NEXT: vsubuhm v2, v2, v4 +; P8BE-NEXT: vadduhm v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, %2 = sdiv <4 x i16> %x, @@ -729,181 +370,116 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; P9LE-LABEL: dont_fold_srem_power_of_two: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: srawi r4, r3, 6 -; P9LE-NEXT: addze r4, r4 -; P9LE-NEXT: slwi r4, r4, 6 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: srawi r4, r3, 5 -; P9LE-NEXT: addze r4, r4 -; P9LE-NEXT: slwi r4, r4, 5 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, -21386 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: ori r4, r4, 37253 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: add r4, r4, r3 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: srawi r4, r3, 3 -; P9LE-NEXT: addze r4, r4 -; P9LE-NEXT: slwi r4, r4, 3 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v4, v2 -; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P9LE-NEXT: vmrglh v3, v2, v2 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P9LE-NEXT: vextsh2w v3, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P9LE-NEXT: vmuluwm v3, v3, v4 +; P9LE-NEXT: vspltisw v4, 8 +; P9LE-NEXT: vadduwm v4, v4, v4 +; P9LE-NEXT: vsrw v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI3_2@toc@ha +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P9LE-NEXT: vadduhm v3, v3, v2 +; P9LE-NEXT: vsrah v3, v3, v4 +; P9LE-NEXT: 
vspltish v4, 15 +; P9LE-NEXT: vsrh v4, v3, v4 +; P9LE-NEXT: vadduhm v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: dont_fold_srem_power_of_two: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: srawi r4, r3, 5 -; P9BE-NEXT: addze r4, r4 -; P9BE-NEXT: slwi r4, r4, 5 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: srawi r4, r3, 6 -; P9BE-NEXT: addze r4, r4 -; P9BE-NEXT: slwi r4, r4, 6 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, -21386 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 37253 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: srawi r4, r3, 3 -; P9BE-NEXT: addze r4, r4 -; P9BE-NEXT: slwi r4, r4, 3 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v3, v2 +; P9BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P9BE-NEXT: vmrghh v3, v2, v2 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P9BE-NEXT: vextsh2w v3, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P9BE-NEXT: vmuluwm v3, v3, v4 +; P9BE-NEXT: vspltisw v4, 8 +; P9BE-NEXT: vadduwm v4, v4, v4 +; P9BE-NEXT: vsrw v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI3_2@toc@ha +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P9BE-NEXT: vadduhm v3, v3, v2 +; P9BE-NEXT: vsrah v3, v3, v4 +; P9BE-NEXT: vspltish v4, 15 +; P9BE-NEXT: vsrh v4, v3, v4 +; P9BE-NEXT: vadduhm v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_srem_power_of_two: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, -21386 -; P8LE-NEXT: ori r3, r3, 37253 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: rldicl r5, r4, 16, 48 -; P8LE-NEXT: clrldi r6, r4, 48 -; P8LE-NEXT: extsh r5, r5 -; P8LE-NEXT: extsh r6, r6 -; P8LE-NEXT: mulhw r3, r5, r3 -; P8LE-NEXT: rldicl r7, r4, 48, 48 -; P8LE-NEXT: srawi r8, r6, 6 -; P8LE-NEXT: extsh r7, r7 -; P8LE-NEXT: addze r8, r8 -; P8LE-NEXT: rldicl r4, r4, 32, 48 -; P8LE-NEXT: srawi r9, r7, 5 -; P8LE-NEXT: extsh r4, r4 -; P8LE-NEXT: slwi r8, r8, 6 -; P8LE-NEXT: add r3, r3, r5 -; P8LE-NEXT: addze r9, r9 -; P8LE-NEXT: sub r6, r6, r8 -; P8LE-NEXT: srwi r10, r3, 31 -; P8LE-NEXT: srawi r3, r3, 6 -; P8LE-NEXT: slwi r8, r9, 5 -; P8LE-NEXT: mtvsrd v2, r6 -; P8LE-NEXT: add r3, r3, r10 -; P8LE-NEXT: srawi r9, r4, 3 -; P8LE-NEXT: sub r6, r7, r8 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: addze r7, r9 -; P8LE-NEXT: mtvsrd v3, r6 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: sub r3, r5, r3 -; P8LE-NEXT: slwi r5, r7, 3 -; P8LE-NEXT: sub r4, r4, r5 -; P8LE-NEXT: mtvsrd v4, r3 -; P8LE-NEXT: mtvsrd v5, r4 -; P8LE-NEXT: vmrghh v3, v4, v5 -; P8LE-NEXT: 
vmrglw v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v2, v2 +; P8LE-NEXT: vspltisw v4, 8 +; P8LE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P8LE-NEXT: vslw v3, v3, v4 +; P8LE-NEXT: vsraw v3, v3, v4 +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: vxor v5, v5, v5 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI3_2@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vadduhm v3, v3, v2 +; P8LE-NEXT: vsrah v3, v3, v4 +; P8LE-NEXT: vspltish v4, 15 +; P8LE-NEXT: vsrh v4, v3, v4 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: vmladduhm v3, v3, v4, v5 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_srem_power_of_two: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, -21386 -; P8BE-NEXT: ori r3, r3, 37253 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 32, 48 -; P8BE-NEXT: extsh r5, r5 -; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: mulhw r3, r5, r3 -; P8BE-NEXT: rldicl r7, r4, 16, 48 -; P8BE-NEXT: srawi r8, r6, 5 -; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: addze r8, r8 -; P8BE-NEXT: rldicl r4, r4, 48, 48 -; P8BE-NEXT: srawi r9, r7, 6 -; P8BE-NEXT: extsh r4, r4 -; P8BE-NEXT: slwi r8, r8, 5 -; P8BE-NEXT: add r3, r3, r5 -; P8BE-NEXT: addze r9, r9 -; P8BE-NEXT: sub r6, r6, r8 -; P8BE-NEXT: srwi r10, r3, 31 -; P8BE-NEXT: srawi r3, r3, 6 -; P8BE-NEXT: slwi r8, r9, 6 -; P8BE-NEXT: add r3, r3, r10 -; P8BE-NEXT: srawi r9, r4, 3 -; P8BE-NEXT: sub r7, r7, r8 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: addze r8, r9 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: slwi r6, r8, 3 -; P8BE-NEXT: sub r4, r4, r6 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: sub r3, r5, r3 -; P8BE-NEXT: sldi r5, r7, 48 -; P8BE-NEXT: mtvsrd v5, r4 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v3, r5 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: vmrghh v3, v5, v4 -; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: vmrghh v3, v2, v2 +; P8BE-NEXT: vspltisw v4, 8 +; P8BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P8BE-NEXT: vslw v3, v3, v4 +; P8BE-NEXT: vsraw v3, v3, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: vxor v5, v5, v5 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI3_2@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P8BE-NEXT: lxvw4x v0, 0, r3 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vadduhm v3, v3, v2 +; P8BE-NEXT: vsrah v3, v3, v4 +; P8BE-NEXT: vspltish v4, 15 +; P8BE-NEXT: vsrh v4, v3, v4 +; P8BE-NEXT: vadduhm v3, v3, v4 +; P8BE-NEXT: vmladduhm v3, v3, v0, v5 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -913,195 +489,146 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; P9LE-LABEL: dont_fold_srem_one: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: lis r4, -14230 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 30865 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: add r4, r4, r3 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 9 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 654 -; 
P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, -19946 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: ori r4, r4, 17097 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v3, v4 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: add r4, r4, r3 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 4 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 23 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, 24749 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: ori r4, r4, 47143 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 11 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 5423 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P9LE-NEXT: vmrglh v5, v2, v2 +; P9LE-NEXT: vspltisw v3, -16 +; P9LE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P9LE-NEXT: vextsh2w v5, v5 +; P9LE-NEXT: vsrw v3, v3, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P9LE-NEXT: xxland v3, v2, v3 +; P9LE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P9LE-NEXT: vmuluwm v4, v5, v4 +; P9LE-NEXT: vspltisw v5, 8 +; P9LE-NEXT: vadduwm v5, v5, v5 +; P9LE-NEXT: vsrw v4, v4, v5 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: vpkuwum v4, v4, v4 +; P9LE-NEXT: vadduhm v3, v4, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P9LE-NEXT: lxvx vs0, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P9LE-NEXT: vsrah v3, v3, v4 +; P9LE-NEXT: vspltish v4, 15 +; P9LE-NEXT: vsrh v4, v3, v4 +; P9LE-NEXT: xxland v4, v4, vs0 +; P9LE-NEXT: vadduhm v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: dont_fold_srem_one: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: lis r4, -19946 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 17097 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 4 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 23 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, 24749 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 47143 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 11 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 5423 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, -14230 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 30865 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v3, v4 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 9 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 654 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: vmrghh v2, v4, v2 -; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P9BE-NEXT: vmrghh v5, v2, v2 +; P9BE-NEXT: 
addi r3, r3, .LCPI4_0@toc@l +; P9BE-NEXT: vextsh2w v5, v5 +; P9BE-NEXT: lxvx vs0, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P9BE-NEXT: xxland v3, v2, vs0 +; P9BE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P9BE-NEXT: vmuluwm v4, v5, v4 +; P9BE-NEXT: vspltisw v5, 8 +; P9BE-NEXT: vadduwm v5, v5, v5 +; P9BE-NEXT: vsrw v4, v4, v5 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: vpkuwum v4, v4, v4 +; P9BE-NEXT: vadduhm v3, v4, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P9BE-NEXT: lxvx vs0, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_4@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI4_4@toc@l +; P9BE-NEXT: vsrah v3, v3, v4 +; P9BE-NEXT: vspltish v4, 15 +; P9BE-NEXT: vsrh v4, v3, v4 +; P9BE-NEXT: xxland v4, v4, vs0 +; P9BE-NEXT: vadduhm v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_srem_one: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r5, 24749 -; P8LE-NEXT: lis r6, -19946 -; P8LE-NEXT: lis r8, -14230 -; P8LE-NEXT: ori r5, r5, 47143 -; P8LE-NEXT: ori r6, r6, 17097 -; P8LE-NEXT: ori r8, r8, 30865 -; P8LE-NEXT: mffprd r3, f0 -; P8LE-NEXT: rldicl r4, r3, 16, 48 -; P8LE-NEXT: rldicl r7, r3, 32, 48 -; P8LE-NEXT: rldicl r3, r3, 48, 48 -; P8LE-NEXT: extsh r4, r4 -; P8LE-NEXT: extsh r7, r7 -; P8LE-NEXT: extsh r3, r3 -; P8LE-NEXT: mulhw r5, r4, r5 -; P8LE-NEXT: mulhw r6, r7, r6 -; P8LE-NEXT: mulhw r8, r3, r8 -; P8LE-NEXT: srwi r9, r5, 31 -; P8LE-NEXT: srawi r5, r5, 11 -; P8LE-NEXT: add r6, r6, r7 -; P8LE-NEXT: add r8, r8, r3 -; P8LE-NEXT: add r5, r5, r9 -; P8LE-NEXT: srwi r9, r6, 31 -; P8LE-NEXT: srawi r6, r6, 4 -; P8LE-NEXT: add r6, r6, r9 -; P8LE-NEXT: srwi r9, r8, 31 -; P8LE-NEXT: srawi r8, r8, 9 -; P8LE-NEXT: mulli r5, r5, 5423 -; P8LE-NEXT: add r8, r8, r9 -; P8LE-NEXT: mulli r6, r6, 23 -; P8LE-NEXT: li r9, 0 -; P8LE-NEXT: mulli r8, r8, 654 -; P8LE-NEXT: mtvsrd v2, r9 -; P8LE-NEXT: sub r4, r4, r5 -; P8LE-NEXT: sub r5, r7, r6 -; P8LE-NEXT: mtvsrd v3, r4 -; P8LE-NEXT: sub r3, r3, r8 -; P8LE-NEXT: mtvsrd v4, r5 -; P8LE-NEXT: mtvsrd v5, r3 -; P8LE-NEXT: vmrghh v3, v3, v4 -; P8LE-NEXT: vmrghh v2, v5, v2 -; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v2, v2 +; P8LE-NEXT: vspltisw v4, 8 +; P8LE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P8LE-NEXT: vxor v0, v0, v0 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P8LE-NEXT: vslw v3, v3, v4 +; P8LE-NEXT: vsraw v3, v3, v4 +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: vspltisw v4, -16 +; P8LE-NEXT: vsrw v4, v4, v4 +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: xxland v4, v2, v4 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P8LE-NEXT: vsrah v3, v3, v4 +; P8LE-NEXT: vspltish v4, 15 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P8LE-NEXT: vsrh v4, v3, v4 +; P8LE-NEXT: xxland v4, v4, v5 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: vmladduhm v3, v3, v4, v0 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_srem_one: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r3, v2 
-; P8BE-NEXT: lis r5, 24749 -; P8BE-NEXT: lis r6, -19946 -; P8BE-NEXT: lis r8, -14230 -; P8BE-NEXT: ori r5, r5, 47143 -; P8BE-NEXT: ori r6, r6, 17097 -; P8BE-NEXT: ori r8, r8, 30865 -; P8BE-NEXT: clrldi r4, r3, 48 -; P8BE-NEXT: rldicl r7, r3, 48, 48 -; P8BE-NEXT: rldicl r3, r3, 32, 48 -; P8BE-NEXT: extsh r4, r4 -; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: extsh r3, r3 -; P8BE-NEXT: mulhw r5, r4, r5 -; P8BE-NEXT: mulhw r6, r7, r6 -; P8BE-NEXT: mulhw r8, r3, r8 -; P8BE-NEXT: srwi r9, r5, 31 -; P8BE-NEXT: srawi r5, r5, 11 -; P8BE-NEXT: add r6, r6, r7 -; P8BE-NEXT: add r8, r8, r3 -; P8BE-NEXT: add r5, r5, r9 -; P8BE-NEXT: srwi r9, r6, 31 -; P8BE-NEXT: srawi r6, r6, 4 -; P8BE-NEXT: add r6, r6, r9 -; P8BE-NEXT: srwi r9, r8, 31 -; P8BE-NEXT: srawi r8, r8, 9 -; P8BE-NEXT: mulli r5, r5, 5423 -; P8BE-NEXT: add r8, r8, r9 -; P8BE-NEXT: mulli r6, r6, 23 -; P8BE-NEXT: li r9, 0 -; P8BE-NEXT: mulli r8, r8, 654 -; P8BE-NEXT: sub r4, r4, r5 -; P8BE-NEXT: sldi r5, r9, 48 -; P8BE-NEXT: mtvsrd v2, r5 -; P8BE-NEXT: sub r5, r7, r6 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: sub r3, r3, r8 -; P8BE-NEXT: mtvsrd v3, r4 -; P8BE-NEXT: sldi r4, r5, 48 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v4, r4 -; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v3, v4, v3 -; P8BE-NEXT: vmrghh v2, v2, v5 -; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: vmrghh v3, v2, v2 +; P8BE-NEXT: vspltisw v4, 8 +; P8BE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P8BE-NEXT: lxvw4x vs0, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P8BE-NEXT: vslw v3, v3, v4 +; P8BE-NEXT: vsraw v3, v3, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: xxland v4, v2, vs0 +; P8BE-NEXT: lxvw4x vs0, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_4@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_4@toc@l +; P8BE-NEXT: lxvw4x v0, 0, r3 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vadduhm v3, v3, v4 +; P8BE-NEXT: vspltish v4, 15 +; P8BE-NEXT: vsrah v3, v3, v5 +; P8BE-NEXT: vxor v5, v5, v5 +; P8BE-NEXT: vsrh v4, v3, v4 +; P8BE-NEXT: xxland v4, v4, vs0 +; P8BE-NEXT: vadduhm v3, v3, v4 +; P8BE-NEXT: vmladduhm v3, v3, v0, v5 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -1111,175 +638,144 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { ; P9LE-LABEL: dont_fold_urem_i16_smax: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: lis r4, -19946 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 17097 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: add r4, r4, r3 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 4 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 23 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, 24749 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: ori r4, r4, 47143 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: mulhw r4, r3, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 11 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 5423 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: srawi 
r4, r3, 15 -; P9LE-NEXT: addze r4, r4 -; P9LE-NEXT: slwi r4, r4, 15 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; P9LE-NEXT: vmrglw v2, v3, v2 +; P9LE-NEXT: addis r3, r2, .LCPI5_0@toc@ha +; P9LE-NEXT: vmrglh v3, v2, v2 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: addi r3, r3, .LCPI5_0@toc@l +; P9LE-NEXT: vextsh2w v3, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI5_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI5_1@toc@l +; P9LE-NEXT: vmuluwm v3, v3, v4 +; P9LE-NEXT: vspltisw v4, 8 +; P9LE-NEXT: vadduwm v4, v4, v4 +; P9LE-NEXT: vsrw v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI5_2@toc@ha +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: addi r3, r3, .LCPI5_2@toc@l +; P9LE-NEXT: vmladduhm v3, v2, v4, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI5_3@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI5_3@toc@l +; P9LE-NEXT: lxvx vs0, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI5_4@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI5_4@toc@l +; P9LE-NEXT: vsrah v3, v3, v4 +; P9LE-NEXT: vspltish v4, 15 +; P9LE-NEXT: vsrh v4, v3, v4 +; P9LE-NEXT: xxland v4, v4, vs0 +; P9LE-NEXT: vadduhm v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: dont_fold_urem_i16_smax: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: lis r4, -19946 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 17097 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 4 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 23 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, 24749 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 47143 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r4, r3, r4 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 11 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 5423 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v3, v4 -; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: srawi r4, r3, 15 -; P9BE-NEXT: addze r4, r4 -; P9BE-NEXT: slwi r4, r4, 15 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: vmrghh v2, v4, v2 -; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: addis r3, r2, .LCPI5_0@toc@ha +; P9BE-NEXT: vmrghh v3, v2, v2 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: addi r3, r3, .LCPI5_0@toc@l +; P9BE-NEXT: vextsh2w v3, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI5_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI5_1@toc@l +; P9BE-NEXT: vmuluwm v3, v3, v4 +; P9BE-NEXT: vspltisw v4, 8 +; P9BE-NEXT: vadduwm v4, v4, v4 +; P9BE-NEXT: vsrw v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI5_2@toc@ha +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: addi r3, r3, .LCPI5_2@toc@l +; P9BE-NEXT: vmladduhm v3, v2, v4, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI5_3@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI5_3@toc@l +; P9BE-NEXT: lxvx vs0, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI5_4@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI5_4@toc@l +; P9BE-NEXT: vsrah v3, v3, v4 +; P9BE-NEXT: vspltish v4, 15 +; P9BE-NEXT: 
vsrh v4, v3, v4 +; P9BE-NEXT: xxland v4, v4, vs0 +; P9BE-NEXT: vadduhm v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_urem_i16_smax: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r4, 24749 -; P8LE-NEXT: lis r5, -19946 -; P8LE-NEXT: ori r4, r4, 47143 -; P8LE-NEXT: ori r5, r5, 17097 -; P8LE-NEXT: mffprd r3, f0 -; P8LE-NEXT: rldicl r6, r3, 16, 48 -; P8LE-NEXT: rldicl r7, r3, 32, 48 -; P8LE-NEXT: extsh r6, r6 -; P8LE-NEXT: extsh r7, r7 -; P8LE-NEXT: mulhw r4, r6, r4 -; P8LE-NEXT: mulhw r5, r7, r5 -; P8LE-NEXT: rldicl r3, r3, 48, 48 -; P8LE-NEXT: extsh r3, r3 -; P8LE-NEXT: srwi r8, r4, 31 -; P8LE-NEXT: srawi r4, r4, 11 -; P8LE-NEXT: add r5, r5, r7 -; P8LE-NEXT: add r4, r4, r8 -; P8LE-NEXT: srwi r8, r5, 31 -; P8LE-NEXT: srawi r5, r5, 4 -; P8LE-NEXT: mulli r4, r4, 5423 -; P8LE-NEXT: add r5, r5, r8 -; P8LE-NEXT: srawi r9, r3, 15 -; P8LE-NEXT: li r8, 0 -; P8LE-NEXT: mulli r5, r5, 23 -; P8LE-NEXT: mtvsrd v2, r8 -; P8LE-NEXT: sub r4, r6, r4 -; P8LE-NEXT: addze r6, r9 -; P8LE-NEXT: slwi r6, r6, 15 -; P8LE-NEXT: mtvsrd v3, r4 -; P8LE-NEXT: sub r5, r7, r5 -; P8LE-NEXT: sub r3, r3, r6 -; P8LE-NEXT: mtvsrd v4, r5 -; P8LE-NEXT: mtvsrd v5, r3 -; P8LE-NEXT: vmrghh v3, v3, v4 -; P8LE-NEXT: vmrghh v2, v5, v2 -; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v2, v2 +; P8LE-NEXT: vspltisw v4, 8 +; P8LE-NEXT: addis r3, r2, .LCPI5_0@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI5_0@toc@l +; P8LE-NEXT: vxor v0, v0, v0 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI5_1@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI5_1@toc@l +; P8LE-NEXT: vslw v3, v3, v4 +; P8LE-NEXT: vsraw v3, v3, v4 +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI5_2@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI5_2@toc@l +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vmladduhm v3, v2, v4, v3 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI5_3@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI5_3@toc@l +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI5_4@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI5_4@toc@l +; P8LE-NEXT: vsrah v3, v3, v4 +; P8LE-NEXT: vspltish v4, 15 +; P8LE-NEXT: vsrh v4, v3, v4 +; P8LE-NEXT: xxland v4, v4, v5 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: vmladduhm v3, v3, v4, v0 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_urem_i16_smax: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r3, v2 -; P8BE-NEXT: lis r4, 24749 -; P8BE-NEXT: lis r5, -19946 -; P8BE-NEXT: ori r4, r4, 47143 -; P8BE-NEXT: ori r5, r5, 17097 -; P8BE-NEXT: clrldi r6, r3, 48 -; P8BE-NEXT: rldicl r7, r3, 48, 48 -; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: mulhw r4, r6, r4 -; P8BE-NEXT: mulhw r5, r7, r5 -; P8BE-NEXT: rldicl r3, r3, 32, 48 -; P8BE-NEXT: extsh r3, r3 -; P8BE-NEXT: srwi r8, r4, 31 -; P8BE-NEXT: srawi r4, r4, 11 -; P8BE-NEXT: add r5, r5, r7 -; P8BE-NEXT: add r4, r4, r8 -; P8BE-NEXT: srwi r8, r5, 31 -; P8BE-NEXT: srawi r5, r5, 4 -; P8BE-NEXT: mulli r4, r4, 5423 -; P8BE-NEXT: add r5, r5, r8 -; P8BE-NEXT: li r8, 0 -; P8BE-NEXT: mulli r5, r5, 23 -; P8BE-NEXT: srawi r9, r3, 15 -; P8BE-NEXT: sub r4, r6, r4 -; P8BE-NEXT: sldi r6, r8, 48 -; P8BE-NEXT: addze r8, r9 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: slwi r6, r8, 15 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: sub r5, r7, r5 -; P8BE-NEXT: sub r3, r3, r6 -; P8BE-NEXT: mtvsrd v3, r4 -; P8BE-NEXT: 
sldi r4, r5, 48 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v4, r4 -; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v3, v4, v3 -; P8BE-NEXT: vmrghh v2, v2, v5 -; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: vmrghh v3, v2, v2 +; P8BE-NEXT: vspltisw v4, 8 +; P8BE-NEXT: addis r3, r2, .LCPI5_0@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI5_0@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI5_1@toc@ha +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI5_1@toc@l +; P8BE-NEXT: vslw v3, v3, v4 +; P8BE-NEXT: vsraw v3, v3, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: vxor v5, v5, v5 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI5_2@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI5_2@toc@l +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vmladduhm v3, v2, v4, v3 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI5_3@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI5_3@toc@l +; P8BE-NEXT: lxvw4x vs0, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI5_4@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI5_4@toc@l +; P8BE-NEXT: lxvw4x v0, 0, r3 +; P8BE-NEXT: vsrah v3, v3, v4 +; P8BE-NEXT: vspltish v4, 15 +; P8BE-NEXT: vsrh v4, v3, v4 +; P8BE-NEXT: xxland v4, v4, vs0 +; P8BE-NEXT: vadduhm v3, v3, v4 +; P8BE-NEXT: vmladduhm v3, v3, v0, v5 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -1289,89 +785,182 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; P9LE-LABEL: dont_fold_srem_i64: ; P9LE: # %bb.0: -; P9LE-NEXT: lis r4, 24749 -; P9LE-NEXT: mfvsrd r3, v3 -; P9LE-NEXT: ori r4, r4, 47142 -; P9LE-NEXT: sldi r4, r4, 32 -; P9LE-NEXT: oris r4, r4, 58853 -; P9LE-NEXT: ori r4, r4, 6055 -; P9LE-NEXT: mulhd r4, r3, r4 -; P9LE-NEXT: rldicl r5, r4, 1, 63 -; P9LE-NEXT: sradi r4, r4, 11 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: lis r5, -19946 -; P9LE-NEXT: mulli r4, r4, 5423 -; P9LE-NEXT: ori r5, r5, 17096 -; P9LE-NEXT: sldi r5, r5, 32 -; P9LE-NEXT: oris r5, r5, 22795 -; P9LE-NEXT: sub r3, r3, r4 +; P9LE-NEXT: lis r3, 24749 +; P9LE-NEXT: mfvsrd r4, v3 +; P9LE-NEXT: ori r3, r3, 47142 +; P9LE-NEXT: sradi r5, r4, 63 +; P9LE-NEXT: sldi r3, r3, 32 +; P9LE-NEXT: oris r3, r3, 58853 +; P9LE-NEXT: ori r3, r3, 6055 +; P9LE-NEXT: mulhdu r6, r4, r3 +; P9LE-NEXT: maddld r5, r5, r3, r6 +; P9LE-NEXT: lis r6, -19946 +; P9LE-NEXT: mulld r3, r4, r3 ; P9LE-NEXT: mfvsrld r4, v3 +; P9LE-NEXT: ori r6, r6, 17096 +; P9LE-NEXT: sldi r6, r6, 32 +; P9LE-NEXT: mtvsrdd v4, r5, r3 +; P9LE-NEXT: sradi r3, r4, 63 +; P9LE-NEXT: oris r5, r6, 22795 ; P9LE-NEXT: ori r5, r5, 8549 -; P9LE-NEXT: mulhd r5, r4, r5 -; P9LE-NEXT: add r5, r5, r4 -; P9LE-NEXT: rldicl r6, r5, 1, 63 -; P9LE-NEXT: sradi r5, r5, 4 -; P9LE-NEXT: add r5, r5, r6 -; P9LE-NEXT: mulli r5, r5, 23 -; P9LE-NEXT: sub r4, r4, r5 -; P9LE-NEXT: mtvsrdd v3, r3, r4 -; P9LE-NEXT: lis r4, 25653 -; P9LE-NEXT: mfvsrd r3, v2 -; P9LE-NEXT: ori r4, r4, 15432 -; P9LE-NEXT: sldi r4, r4, 32 -; P9LE-NEXT: oris r4, r4, 1603 -; P9LE-NEXT: ori r4, r4, 21445 -; P9LE-NEXT: mulhd r4, r3, r4 -; P9LE-NEXT: rldicl r5, r4, 1, 63 -; P9LE-NEXT: sradi r4, r4, 8 -; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: mulhdu r6, r4, r5 +; P9LE-NEXT: sub r6, r6, r4 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: maddld r3, r3, r5, r6 +; P9LE-NEXT: lis r6, 25653 +; P9LE-NEXT: mfvsrd r5, v2 +; P9LE-NEXT: ori r6, r6, 15432 +; P9LE-NEXT: sldi r6, r6, 32 +; P9LE-NEXT: mtvsrdd v5, r3, r4 +; P9LE-NEXT: addis r3, r2, .LCPI6_0@toc@ha +; P9LE-NEXT: addis r4, r2, .LCPI6_1@toc@ha +; P9LE-NEXT: oris r6, r6, 1603 +; P9LE-NEXT: addi r3, r3, 
.LCPI6_0@toc@l +; P9LE-NEXT: addi r4, r4, .LCPI6_1@toc@l +; P9LE-NEXT: ori r6, r6, 21445 +; P9LE-NEXT: lxvx v0, 0, r3 +; P9LE-NEXT: lxvx vs0, 0, r4 +; P9LE-NEXT: addis r3, r2, .LCPI6_2@toc@ha +; P9LE-NEXT: addis r4, r2, .LCPI6_3@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI6_2@toc@l +; P9LE-NEXT: addi r4, r4, .LCPI6_3@toc@l +; P9LE-NEXT: lxvx v6, 0, r4 +; P9LE-NEXT: vspltb v1, v0, 15 +; P9LE-NEXT: vsro v4, v4, v0 +; P9LE-NEXT: vsro v5, v5, v0 +; P9LE-NEXT: vsr v4, v4, v1 +; P9LE-NEXT: vsr v5, v5, v1 +; P9LE-NEXT: xxmrgld v4, v4, v5 +; P9LE-NEXT: xxland v5, v3, vs0 +; P9LE-NEXT: vaddudm v4, v4, v5 +; P9LE-NEXT: lxvx v5, 0, r3 +; P9LE-NEXT: vsrad v6, v4, v6 +; P9LE-NEXT: vsrd v4, v4, v5 +; P9LE-NEXT: vaddudm v4, v6, v4 +; P9LE-NEXT: xxlxor v6, v6, v6 +; P9LE-NEXT: mfvsrld r3, v4 +; P9LE-NEXT: mfvsrd r4, v4 +; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: mulli r3, r3, 23 +; P9LE-NEXT: mtvsrdd v4, r4, r3 +; P9LE-NEXT: mulhdu r3, r5, r6 +; P9LE-NEXT: sradi r4, r5, 63 +; P9LE-NEXT: vsubudm v3, v3, v4 +; P9LE-NEXT: maddld r3, r4, r6, r3 +; P9LE-NEXT: mulld r4, r5, r6 +; P9LE-NEXT: mtvsrdd v4, r3, r4 +; P9LE-NEXT: addis r3, r2, .LCPI6_4@toc@ha +; P9LE-NEXT: vsro v4, v4, v0 +; P9LE-NEXT: vsro v0, v6, v0 +; P9LE-NEXT: addi r3, r3, .LCPI6_4@toc@l +; P9LE-NEXT: vsr v4, v4, v1 +; P9LE-NEXT: vsr v0, v0, v1 +; P9LE-NEXT: xxmrgld v4, v4, v0 +; P9LE-NEXT: xxland v0, v2, vs0 +; P9LE-NEXT: vaddudm v4, v4, v0 +; P9LE-NEXT: lxvx v0, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI6_5@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI6_5@toc@l +; P9LE-NEXT: lxvx vs0, 0, r3 +; P9LE-NEXT: vsrad v0, v4, v0 +; P9LE-NEXT: vsrd v4, v4, v5 +; P9LE-NEXT: xxland v4, v4, vs0 +; P9LE-NEXT: vaddudm v4, v0, v4 +; P9LE-NEXT: mfvsrd r4, v4 +; P9LE-NEXT: mfvsrld r3, v4 ; P9LE-NEXT: mulli r4, r4, 654 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: li r4, 0 -; P9LE-NEXT: mtvsrdd v2, r3, r4 +; P9LE-NEXT: mtvsrdd v4, r4, r3 +; P9LE-NEXT: vsubudm v2, v2, v4 ; P9LE-NEXT: blr ; ; P9BE-LABEL: dont_fold_srem_i64: ; P9BE: # %bb.0: -; P9BE-NEXT: lis r4, 24749 -; P9BE-NEXT: mfvsrld r3, v3 -; P9BE-NEXT: ori r4, r4, 47142 -; P9BE-NEXT: sldi r4, r4, 32 -; P9BE-NEXT: oris r4, r4, 58853 -; P9BE-NEXT: ori r4, r4, 6055 -; P9BE-NEXT: mulhd r4, r3, r4 -; P9BE-NEXT: rldicl r5, r4, 1, 63 -; P9BE-NEXT: sradi r4, r4, 11 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: lis r5, -19946 -; P9BE-NEXT: ori r5, r5, 17096 -; P9BE-NEXT: mulli r4, r4, 5423 -; P9BE-NEXT: sldi r5, r5, 32 -; P9BE-NEXT: oris r5, r5, 22795 -; P9BE-NEXT: sub r3, r3, r4 +; P9BE-NEXT: lis r3, 24749 +; P9BE-NEXT: mfvsrld r4, v3 +; P9BE-NEXT: ori r3, r3, 47142 +; P9BE-NEXT: sradi r5, r4, 63 +; P9BE-NEXT: sldi r3, r3, 32 +; P9BE-NEXT: oris r3, r3, 58853 +; P9BE-NEXT: ori r3, r3, 6055 +; P9BE-NEXT: mulhdu r6, r4, r3 +; P9BE-NEXT: maddld r5, r5, r3, r6 +; P9BE-NEXT: lis r6, -19946 +; P9BE-NEXT: mulld r3, r4, r3 ; P9BE-NEXT: mfvsrd r4, v3 +; P9BE-NEXT: ori r6, r6, 17096 +; P9BE-NEXT: sldi r6, r6, 32 +; P9BE-NEXT: mtvsrdd v4, r5, r3 +; P9BE-NEXT: sradi r3, r4, 63 +; P9BE-NEXT: oris r5, r6, 22795 ; P9BE-NEXT: ori r5, r5, 8549 -; P9BE-NEXT: mulhd r5, r4, r5 -; P9BE-NEXT: add r5, r5, r4 -; P9BE-NEXT: rldicl r6, r5, 1, 63 -; P9BE-NEXT: sradi r5, r5, 4 -; P9BE-NEXT: add r5, r5, r6 -; P9BE-NEXT: mulli r5, r5, 23 -; P9BE-NEXT: sub r4, r4, r5 -; P9BE-NEXT: mtvsrdd v3, r4, r3 -; P9BE-NEXT: lis r4, 25653 -; P9BE-NEXT: mfvsrld r3, v2 -; P9BE-NEXT: ori r4, r4, 15432 -; P9BE-NEXT: sldi r4, r4, 32 -; P9BE-NEXT: oris r4, r4, 1603 -; P9BE-NEXT: ori r4, r4, 21445 -; P9BE-NEXT: mulhd r4, r3, r4 -; P9BE-NEXT: rldicl r5, r4, 1, 63 -; P9BE-NEXT: 
sradi r4, r4, 8 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 654 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: mtvsrdd v2, 0, r3 +; P9BE-NEXT: mulhdu r6, r4, r5 +; P9BE-NEXT: sub r6, r6, r4 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: maddld r3, r3, r5, r6 +; P9BE-NEXT: lis r6, 25653 +; P9BE-NEXT: mfvsrld r5, v2 +; P9BE-NEXT: ori r6, r6, 15432 +; P9BE-NEXT: sldi r6, r6, 32 +; P9BE-NEXT: mtvsrdd v5, r3, r4 +; P9BE-NEXT: addis r3, r2, .LCPI6_0@toc@ha +; P9BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha +; P9BE-NEXT: oris r6, r6, 1603 +; P9BE-NEXT: addi r3, r3, .LCPI6_0@toc@l +; P9BE-NEXT: addi r4, r4, .LCPI6_1@toc@l +; P9BE-NEXT: ori r6, r6, 21445 +; P9BE-NEXT: lxvx v0, 0, r3 +; P9BE-NEXT: lxvx vs0, 0, r4 +; P9BE-NEXT: addis r3, r2, .LCPI6_2@toc@ha +; P9BE-NEXT: addis r4, r2, .LCPI6_3@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI6_2@toc@l +; P9BE-NEXT: addi r4, r4, .LCPI6_3@toc@l +; P9BE-NEXT: lxvx v6, 0, r4 +; P9BE-NEXT: vspltb v1, v0, 15 +; P9BE-NEXT: vsro v4, v4, v0 +; P9BE-NEXT: vsro v5, v5, v0 +; P9BE-NEXT: vsr v4, v4, v1 +; P9BE-NEXT: vsr v5, v5, v1 +; P9BE-NEXT: xxmrgld v4, v5, v4 +; P9BE-NEXT: xxland v5, v3, vs0 +; P9BE-NEXT: vaddudm v4, v4, v5 +; P9BE-NEXT: lxvx v5, 0, r3 +; P9BE-NEXT: vsrad v6, v4, v6 +; P9BE-NEXT: vsrd v4, v4, v5 +; P9BE-NEXT: vaddudm v4, v6, v4 +; P9BE-NEXT: xxlxor v6, v6, v6 +; P9BE-NEXT: mfvsrld r3, v4 +; P9BE-NEXT: mfvsrd r4, v4 +; P9BE-NEXT: mulli r4, r4, 23 +; P9BE-NEXT: mulli r3, r3, 5423 +; P9BE-NEXT: mtvsrdd v4, r4, r3 +; P9BE-NEXT: mulhdu r3, r5, r6 +; P9BE-NEXT: sradi r4, r5, 63 +; P9BE-NEXT: vsubudm v3, v3, v4 +; P9BE-NEXT: maddld r3, r4, r6, r3 +; P9BE-NEXT: mulld r4, r5, r6 +; P9BE-NEXT: mtvsrdd v4, r3, r4 +; P9BE-NEXT: addis r3, r2, .LCPI6_4@toc@ha +; P9BE-NEXT: vsro v4, v4, v0 +; P9BE-NEXT: vsro v0, v6, v0 +; P9BE-NEXT: addi r3, r3, .LCPI6_4@toc@l +; P9BE-NEXT: vsr v4, v4, v1 +; P9BE-NEXT: vsr v0, v0, v1 +; P9BE-NEXT: xxmrgld v4, v0, v4 +; P9BE-NEXT: xxland v0, v2, vs0 +; P9BE-NEXT: vaddudm v4, v4, v0 +; P9BE-NEXT: lxvx v0, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI6_5@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI6_5@toc@l +; P9BE-NEXT: lxvx vs0, 0, r3 +; P9BE-NEXT: vsrad v0, v4, v0 +; P9BE-NEXT: vsrd v4, v4, v5 +; P9BE-NEXT: xxland v4, v4, vs0 +; P9BE-NEXT: vaddudm v4, v0, v4 +; P9BE-NEXT: mfvsrld r3, v4 +; P9BE-NEXT: mfvsrd r4, v4 +; P9BE-NEXT: mulli r3, r3, 654 +; P9BE-NEXT: mtvsrdd v4, r4, r3 +; P9BE-NEXT: vsubudm v2, v2, v4 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_srem_i64: @@ -1390,37 +979,79 @@ ; P8LE-NEXT: sldi r5, r5, 32 ; P8LE-NEXT: oris r3, r3, 58853 ; P8LE-NEXT: oris r4, r4, 22795 -; P8LE-NEXT: mffprd r8, f0 +; P8LE-NEXT: mffprd r10, f0 +; P8LE-NEXT: sradi r8, r6, 63 ; P8LE-NEXT: oris r5, r5, 1603 ; P8LE-NEXT: ori r3, r3, 6055 ; P8LE-NEXT: ori r4, r4, 8549 +; P8LE-NEXT: sradi r9, r7, 63 ; P8LE-NEXT: ori r5, r5, 21445 -; P8LE-NEXT: mulhd r3, r6, r3 -; P8LE-NEXT: mulhd r5, r7, r5 -; P8LE-NEXT: mulhd r4, r8, r4 -; P8LE-NEXT: rldicl r9, r3, 1, 63 -; P8LE-NEXT: sradi r3, r3, 11 -; P8LE-NEXT: add r3, r3, r9 -; P8LE-NEXT: rldicl r9, r5, 1, 63 -; P8LE-NEXT: add r4, r4, r8 -; P8LE-NEXT: sradi r5, r5, 8 +; P8LE-NEXT: mulld r8, r8, r3 +; P8LE-NEXT: mulhdu r3, r6, r3 +; P8LE-NEXT: mulld r6, r9, r5 +; P8LE-NEXT: sradi r9, r10, 63 +; P8LE-NEXT: mulhdu r11, r10, r4 +; P8LE-NEXT: mulhdu r5, r7, r5 +; P8LE-NEXT: addis r7, r2, .LCPI6_0@toc@ha +; P8LE-NEXT: mulld r4, r9, r4 +; P8LE-NEXT: addi r7, r7, .LCPI6_0@toc@l +; P8LE-NEXT: lxvd2x vs0, 0, r7 +; P8LE-NEXT: add r3, r3, r8 +; P8LE-NEXT: li r7, 0 +; P8LE-NEXT: mtfprd f2, r3 +; P8LE-NEXT: sub r3, r11, r10 +; 
P8LE-NEXT: add r5, r5, r6 +; P8LE-NEXT: mtfprd f1, r7 +; P8LE-NEXT: addis r6, r2, .LCPI6_3@toc@ha +; P8LE-NEXT: add r3, r3, r4 +; P8LE-NEXT: addis r4, r2, .LCPI6_1@toc@ha +; P8LE-NEXT: xxswapd v4, vs0 +; P8LE-NEXT: mtfprd f0, r5 +; P8LE-NEXT: addi r6, r6, .LCPI6_3@toc@l +; P8LE-NEXT: mtfprd f4, r3 +; P8LE-NEXT: addis r3, r2, .LCPI6_2@toc@ha +; P8LE-NEXT: addi r4, r4, .LCPI6_1@toc@l +; P8LE-NEXT: lxvd2x vs3, 0, r6 +; P8LE-NEXT: addi r3, r3, .LCPI6_2@toc@l +; P8LE-NEXT: xxmrghd v5, vs0, vs1 +; P8LE-NEXT: lxvd2x vs0, 0, r4 +; P8LE-NEXT: addis r4, r2, .LCPI6_4@toc@ha +; P8LE-NEXT: xxmrghd v0, vs2, vs4 +; P8LE-NEXT: lxvd2x vs1, 0, r3 +; P8LE-NEXT: xxland v1, v2, v4 +; P8LE-NEXT: addi r4, r4, .LCPI6_4@toc@l +; P8LE-NEXT: xxland v4, v3, v4 +; P8LE-NEXT: xxswapd v6, vs3 +; P8LE-NEXT: lxvd2x vs2, 0, r4 +; P8LE-NEXT: vaddudm v5, v5, v1 +; P8LE-NEXT: xxswapd v1, vs0 +; P8LE-NEXT: vaddudm v4, v0, v4 +; P8LE-NEXT: xxswapd v0, vs1 +; P8LE-NEXT: vsrad v6, v5, v6 +; P8LE-NEXT: xxswapd v7, vs2 +; P8LE-NEXT: vsrd v5, v5, v1 +; P8LE-NEXT: vsrd v1, v4, v1 +; P8LE-NEXT: vsrad v4, v4, v0 +; P8LE-NEXT: xxland v5, v5, v7 +; P8LE-NEXT: vaddudm v4, v4, v1 +; P8LE-NEXT: vaddudm v5, v6, v5 +; P8LE-NEXT: xxswapd vs0, v4 +; P8LE-NEXT: mfvsrd r3, v4 +; P8LE-NEXT: mfvsrd r5, v5 +; P8LE-NEXT: xxswapd vs1, v5 ; P8LE-NEXT: mulli r3, r3, 5423 -; P8LE-NEXT: add r5, r5, r9 -; P8LE-NEXT: rldicl r9, r4, 1, 63 -; P8LE-NEXT: sradi r4, r4, 4 +; P8LE-NEXT: mffprd r4, f0 ; P8LE-NEXT: mulli r5, r5, 654 -; P8LE-NEXT: add r4, r4, r9 +; P8LE-NEXT: mffprd r6, f1 ; P8LE-NEXT: mulli r4, r4, 23 -; P8LE-NEXT: sub r3, r6, r3 -; P8LE-NEXT: mtfprd f0, r3 -; P8LE-NEXT: sub r5, r7, r5 -; P8LE-NEXT: mtfprd f1, r5 -; P8LE-NEXT: sub r3, r8, r4 -; P8LE-NEXT: li r4, 0 -; P8LE-NEXT: mtfprd f2, r3 -; P8LE-NEXT: mtfprd f3, r4 -; P8LE-NEXT: xxmrghd v3, vs0, vs2 -; P8LE-NEXT: xxmrghd v2, vs1, vs3 +; P8LE-NEXT: mtfprd f0, r6 +; P8LE-NEXT: mtfprd f1, r3 +; P8LE-NEXT: mtfprd f3, r5 +; P8LE-NEXT: mtfprd f2, r4 +; P8LE-NEXT: xxmrghd v5, vs3, vs0 +; P8LE-NEXT: xxmrghd v4, vs1, vs2 +; P8LE-NEXT: vsubudm v2, v2, v5 +; P8LE-NEXT: vsubudm v3, v3, v4 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_srem_i64: @@ -1428,49 +1059,86 @@ ; P8BE-NEXT: lis r4, -19946 ; P8BE-NEXT: lis r3, 24749 ; P8BE-NEXT: xxswapd vs0, v3 +; P8BE-NEXT: mfvsrd r6, v3 ; P8BE-NEXT: lis r5, 25653 -; P8BE-NEXT: xxswapd vs1, v2 ; P8BE-NEXT: ori r4, r4, 17096 ; P8BE-NEXT: ori r3, r3, 47142 +; P8BE-NEXT: xxswapd vs1, v2 ; P8BE-NEXT: ori r5, r5, 15432 -; P8BE-NEXT: mfvsrd r6, v3 ; P8BE-NEXT: sldi r4, r4, 32 ; P8BE-NEXT: sldi r3, r3, 32 ; P8BE-NEXT: oris r4, r4, 22795 -; P8BE-NEXT: sldi r5, r5, 32 ; P8BE-NEXT: oris r3, r3, 58853 -; P8BE-NEXT: mffprd r7, f0 +; P8BE-NEXT: mffprd r8, f0 +; P8BE-NEXT: sradi r7, r6, 63 ; P8BE-NEXT: ori r4, r4, 8549 ; P8BE-NEXT: ori r3, r3, 6055 +; P8BE-NEXT: mffprd r9, f1 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: mulld r7, r7, r4 +; P8BE-NEXT: mulhdu r4, r6, r4 ; P8BE-NEXT: oris r5, r5, 1603 -; P8BE-NEXT: mffprd r8, f1 -; P8BE-NEXT: mulhd r4, r6, r4 -; P8BE-NEXT: mulhd r3, r7, r3 +; P8BE-NEXT: mulhdu r10, r8, r3 +; P8BE-NEXT: sradi r8, r8, 63 ; P8BE-NEXT: ori r5, r5, 21445 -; P8BE-NEXT: mulhd r5, r8, r5 -; P8BE-NEXT: add r4, r4, r6 -; P8BE-NEXT: rldicl r9, r3, 1, 63 -; P8BE-NEXT: sradi r3, r3, 11 -; P8BE-NEXT: rldicl r10, r4, 1, 63 -; P8BE-NEXT: sradi r4, r4, 4 -; P8BE-NEXT: add r3, r3, r9 -; P8BE-NEXT: rldicl r9, r5, 1, 63 -; P8BE-NEXT: add r4, r4, r10 -; P8BE-NEXT: sradi r5, r5, 8 -; P8BE-NEXT: mulli r3, r3, 5423 -; P8BE-NEXT: add r5, r5, r9 -; P8BE-NEXT: mulli r4, r4, 23 -; 
P8BE-NEXT: mulli r5, r5, 654 -; P8BE-NEXT: sub r3, r7, r3 -; P8BE-NEXT: sub r4, r6, r4 +; P8BE-NEXT: mulld r3, r8, r3 +; P8BE-NEXT: sradi r8, r9, 63 +; P8BE-NEXT: mulhdu r9, r9, r5 +; P8BE-NEXT: mulld r5, r8, r5 +; P8BE-NEXT: sub r4, r4, r6 +; P8BE-NEXT: li r6, 0 +; P8BE-NEXT: mtfprd f0, r6 +; P8BE-NEXT: add r4, r4, r7 +; P8BE-NEXT: addis r6, r2, .LCPI6_0@toc@ha +; P8BE-NEXT: mtfprd f1, r4 +; P8BE-NEXT: addi r4, r6, .LCPI6_0@toc@l +; P8BE-NEXT: add r3, r10, r3 +; P8BE-NEXT: lxvw4x vs2, 0, r4 +; P8BE-NEXT: add r4, r9, r5 +; P8BE-NEXT: mtfprd f3, r3 +; P8BE-NEXT: addis r3, r2, .LCPI6_2@toc@ha +; P8BE-NEXT: addis r5, r2, .LCPI6_3@toc@ha +; P8BE-NEXT: mtfprd f4, r4 +; P8BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI6_2@toc@l +; P8BE-NEXT: xxmrghd v4, vs1, vs3 +; P8BE-NEXT: addi r4, r4, .LCPI6_1@toc@l +; P8BE-NEXT: lxvd2x v1, 0, r3 +; P8BE-NEXT: addi r3, r5, .LCPI6_3@toc@l +; P8BE-NEXT: xxmrghd v0, vs0, vs4 +; P8BE-NEXT: xxland v5, v3, vs2 +; P8BE-NEXT: xxland v6, v2, vs2 +; P8BE-NEXT: vaddudm v4, v4, v5 +; P8BE-NEXT: lxvd2x v5, 0, r4 +; P8BE-NEXT: vaddudm v0, v0, v6 +; P8BE-NEXT: lxvd2x v6, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI6_4@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI6_4@toc@l +; P8BE-NEXT: vsrad v1, v4, v1 +; P8BE-NEXT: lxvw4x vs0, 0, r3 +; P8BE-NEXT: vsrd v4, v4, v5 +; P8BE-NEXT: vsrd v5, v0, v5 +; P8BE-NEXT: vsrad v6, v0, v6 +; P8BE-NEXT: xxland v5, v5, vs0 +; P8BE-NEXT: vaddudm v4, v1, v4 +; P8BE-NEXT: vaddudm v5, v6, v5 +; P8BE-NEXT: mfvsrd r3, v4 +; P8BE-NEXT: xxswapd vs0, v4 +; P8BE-NEXT: xxswapd vs1, v5 +; P8BE-NEXT: mulli r3, r3, 23 +; P8BE-NEXT: mffprd r4, f0 +; P8BE-NEXT: mffprd r5, f1 +; P8BE-NEXT: mulli r4, r4, 5423 ; P8BE-NEXT: mtfprd f0, r3 -; P8BE-NEXT: sub r3, r8, r5 +; P8BE-NEXT: mulli r3, r5, 654 +; P8BE-NEXT: mfvsrd r5, v5 ; P8BE-NEXT: mtfprd f1, r4 -; P8BE-NEXT: li r4, 0 -; P8BE-NEXT: mtfprd f2, r3 -; P8BE-NEXT: mtfprd f3, r4 -; P8BE-NEXT: xxmrghd v3, vs1, vs0 -; P8BE-NEXT: xxmrghd v2, vs3, vs2 +; P8BE-NEXT: mtfprd f2, r5 +; P8BE-NEXT: mtfprd f3, r3 +; P8BE-NEXT: xxmrghd v4, vs0, vs1 +; P8BE-NEXT: xxmrghd v5, vs2, vs3 +; P8BE-NEXT: vsubudm v3, v3, v4 +; P8BE-NEXT: vsubudm v2, v2, v5 ; P8BE-NEXT: blr %1 = srem <4 x i64> %x, ret <4 x i64> %1 diff --git a/llvm/test/CodeGen/PowerPC/urem-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-lkk.ll --- a/llvm/test/CodeGen/PowerPC/urem-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/urem-lkk.ll @@ -87,20 +87,89 @@ ; Don't fold i64 urem define i64 @dont_fold_urem_i64(i64 %x) { -; CHECK-LABEL: dont_fold_urem_i64: -; CHECK: # %bb.0: -; CHECK-NEXT: mflr 0 -; CHECK-NEXT: stw 0, 4(1) -; CHECK-NEXT: stwu 1, -16(1) -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset lr, 4 -; CHECK-NEXT: li 5, 0 -; CHECK-NEXT: li 6, 98 -; CHECK-NEXT: bl __umoddi3 -; CHECK-NEXT: lwz 0, 20(1) -; CHECK-NEXT: addi 1, 1, 16 -; CHECK-NEXT: mtlr 0 -; CHECK-NEXT: blr +; CHECK64-LABEL: dont_fold_urem_i64: +; CHECK64: # %bb.0: +; CHECK64-NEXT: mflr 0 +; CHECK64-NEXT: stw 0, 4(1) +; CHECK64-NEXT: stwu 1, -32(1) +; CHECK64-NEXT: .cfi_def_cfa_offset 32 +; CHECK64-NEXT: .cfi_offset lr, 4 +; CHECK64-NEXT: .cfi_offset r29, -12 +; CHECK64-NEXT: .cfi_offset r30, -8 +; CHECK64-NEXT: rotlwi 6, 4, 31 +; CHECK64-NEXT: stw 29, 20(1) # 4-byte Folded Spill +; CHECK64-NEXT: mr 29, 3 +; CHECK64-NEXT: rlwimi 6, 3, 31, 0, 0 +; CHECK64-NEXT: srwi 5, 3, 1 +; CHECK64-NEXT: stw 30, 24(1) # 4-byte Folded Spill +; CHECK64-NEXT: mr 30, 4 +; CHECK64-NEXT: lis 3, 21399 +; CHECK64-NEXT: lis 4, -17388 +; CHECK64-NEXT: ori 9, 3, 33436 +; CHECK64-NEXT: ori 10, 4, 58849 +; 
CHECK64-NEXT: li 3, 0 +; CHECK64-NEXT: li 4, 0 +; CHECK64-NEXT: li 7, 0 +; CHECK64-NEXT: li 8, 0 +; CHECK64-NEXT: bl __multi3 +; CHECK64-NEXT: rotlwi 4, 4, 28 +; CHECK64-NEXT: li 5, 98 +; CHECK64-NEXT: rlwimi 4, 3, 28, 0, 3 +; CHECK64-NEXT: srwi 3, 3, 4 +; CHECK64-NEXT: mulhwu 5, 4, 5 +; CHECK64-NEXT: mulli 3, 3, 98 +; CHECK64-NEXT: mulli 4, 4, 98 +; CHECK64-NEXT: add 3, 5, 3 +; CHECK64-NEXT: subc 4, 30, 4 +; CHECK64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload +; CHECK64-NEXT: subfe 3, 3, 29 +; CHECK64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload +; CHECK64-NEXT: lwz 0, 36(1) +; CHECK64-NEXT: addi 1, 1, 32 +; CHECK64-NEXT: mtlr 0 +; CHECK64-NEXT: blr +; +; CHECK32-LABEL: dont_fold_urem_i64: +; CHECK32: # %bb.0: +; CHECK32-NEXT: mflr 0 +; CHECK32-NEXT: stw 0, 4(1) +; CHECK32-NEXT: stwu 1, -32(1) +; CHECK32-NEXT: .cfi_def_cfa_offset 32 +; CHECK32-NEXT: .cfi_offset lr, 4 +; CHECK32-NEXT: .cfi_offset r29, -12 +; CHECK32-NEXT: .cfi_offset r30, -8 +; CHECK32-NEXT: rotlwi 6, 4, 31 +; CHECK32-NEXT: stw 29, 20(1) # 4-byte Folded Spill +; CHECK32-NEXT: mr 29, 3 +; CHECK32-NEXT: stw 30, 24(1) # 4-byte Folded Spill +; CHECK32-NEXT: mr 30, 4 +; CHECK32-NEXT: rlwimi 6, 3, 31, 0, 0 +; CHECK32-NEXT: srwi 5, 3, 1 +; CHECK32-NEXT: lis 3, 21399 +; CHECK32-NEXT: lis 4, -17388 +; CHECK32-NEXT: ori 9, 3, 33436 +; CHECK32-NEXT: ori 10, 4, 58849 +; CHECK32-NEXT: li 3, 0 +; CHECK32-NEXT: li 4, 0 +; CHECK32-NEXT: li 7, 0 +; CHECK32-NEXT: li 8, 0 +; CHECK32-NEXT: bl __multi3 +; CHECK32-NEXT: rotlwi 4, 4, 28 +; CHECK32-NEXT: li 5, 98 +; CHECK32-NEXT: rlwimi 4, 3, 28, 0, 3 +; CHECK32-NEXT: srwi 3, 3, 4 +; CHECK32-NEXT: mulhwu 5, 4, 5 +; CHECK32-NEXT: mulli 3, 3, 98 +; CHECK32-NEXT: add 3, 5, 3 +; CHECK32-NEXT: mulli 4, 4, 98 +; CHECK32-NEXT: subc 4, 30, 4 +; CHECK32-NEXT: subfe 3, 3, 29 +; CHECK32-NEXT: lwz 30, 24(1) # 4-byte Folded Reload +; CHECK32-NEXT: lwz 29, 20(1) # 4-byte Folded Reload +; CHECK32-NEXT: lwz 0, 36(1) +; CHECK32-NEXT: addi 1, 1, 32 +; CHECK32-NEXT: mtlr 0 +; CHECK32-NEXT: blr %1 = urem i64 %x, 98 ret i64 %1 } diff --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll --- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -11,209 +11,156 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; P9LE-LABEL: fold_urem_vec_1: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: lis r4, 21399 -; P9LE-NEXT: lis r5, 8456 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 33437 -; P9LE-NEXT: ori r5, r5, 16913 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: srwi r4, r4, 5 -; P9LE-NEXT: mulli r4, r4, 98 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, 16727 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: ori r4, r4, 2287 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: srwi r4, r4, 8 -; P9LE-NEXT: mulli r4, r4, 1003 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: clrlwi r4, r3, 16 -; P9LE-NEXT: rlwinm r3, r3, 30, 18, 31 -; P9LE-NEXT: mulhwu r3, r3, r5 -; P9LE-NEXT: srwi r3, r3, 2 -; P9LE-NEXT: mulli r3, r3, 124 -; P9LE-NEXT: sub r3, r4, r3 -; P9LE-NEXT: lis r4, 22765 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: ori r4, r4, 8969 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: sub r5, r3, r4 -; P9LE-NEXT: 
srwi r5, r5, 1 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v4, v2 -; P9LE-NEXT: vmrglw v2, v3, v2 +; P9LE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P9LE-NEXT: xxlxor v4, v4, v4 +; P9LE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P9LE-NEXT: lxvx v3, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P9LE-NEXT: lxvx v5, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P9LE-NEXT: vsrh v3, v2, v3 +; P9LE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: vmuluwm v3, v3, v5 +; P9LE-NEXT: vspltisw v5, 8 +; P9LE-NEXT: vadduwm v5, v5, v5 +; P9LE-NEXT: vsrw v3, v3, v5 +; P9LE-NEXT: vpkuwum v0, v3, v3 +; P9LE-NEXT: vsubuhm v0, v2, v0 +; P9LE-NEXT: vmrglh v4, v4, v0 +; P9LE-NEXT: lxvx v0, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P9LE-NEXT: vmuluwm v4, v4, v0 +; P9LE-NEXT: vsrw v4, v4, v5 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: vadduhm v3, v4, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI0_4@toc@ha +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: addi r3, r3, .LCPI0_4@toc@l +; P9LE-NEXT: vsrh v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: fold_urem_vec_1: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r4, 16727 -; P9BE-NEXT: lis r5, 8456 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 2287 -; P9BE-NEXT: ori r5, r5, 16913 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: srwi r4, r4, 8 -; P9BE-NEXT: mulli r4, r4, 1003 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, 21399 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 33437 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: srwi r4, r4, 5 -; P9BE-NEXT: mulli r4, r4, 98 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: rlwinm r3, r3, 30, 18, 31 -; P9BE-NEXT: mulhwu r3, r3, r5 -; P9BE-NEXT: srwi r3, r3, 2 -; P9BE-NEXT: mulli r3, r3, 124 -; P9BE-NEXT: sub r3, r4, r3 -; P9BE-NEXT: lis r4, 22765 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 8969 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: sub r5, r3, r4 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P9BE-NEXT: xxlxor v5, v5, v5 +; P9BE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P9BE-NEXT: vsrh v3, v2, v3 +; P9BE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P9BE-NEXT: lxvx v0, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P9BE-NEXT: vperm v3, v5, v3, v4 +; P9BE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P9BE-NEXT: vmuluwm v3, v3, v0 +; P9BE-NEXT: 
vspltisw v0, 8 +; P9BE-NEXT: vadduwm v0, v0, v0 +; P9BE-NEXT: vsrw v3, v3, v0 +; P9BE-NEXT: vpkuwum v1, v3, v3 +; P9BE-NEXT: vsubuhm v1, v2, v1 +; P9BE-NEXT: vperm v4, v5, v1, v4 +; P9BE-NEXT: lxvx v5, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_4@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI0_4@toc@l +; P9BE-NEXT: vmuluwm v4, v4, v5 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: vsrw v4, v4, v0 +; P9BE-NEXT: vadduhm v3, v4, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI0_5@toc@ha +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: addi r3, r3, .LCPI0_5@toc@l +; P9BE-NEXT: vsrh v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: fold_urem_vec_1: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: lis r7, 21399 -; P8LE-NEXT: lis r9, 16727 -; P8LE-NEXT: lis r10, 8456 -; P8LE-NEXT: ori r3, r3, 8969 -; P8LE-NEXT: ori r7, r7, 33437 -; P8LE-NEXT: ori r9, r9, 2287 -; P8LE-NEXT: ori r10, r10, 16913 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: clrldi r6, r4, 48 -; P8LE-NEXT: rldicl r5, r4, 32, 48 -; P8LE-NEXT: clrlwi r6, r6, 16 -; P8LE-NEXT: rldicl r8, r4, 16, 48 -; P8LE-NEXT: clrlwi r5, r5, 16 -; P8LE-NEXT: mulhwu r3, r6, r3 -; P8LE-NEXT: rldicl r4, r4, 48, 48 -; P8LE-NEXT: clrlwi r8, r8, 16 -; P8LE-NEXT: rlwinm r11, r4, 30, 18, 31 -; P8LE-NEXT: mulhwu r7, r5, r7 -; P8LE-NEXT: clrlwi r4, r4, 16 -; P8LE-NEXT: mulhwu r9, r8, r9 -; P8LE-NEXT: mulhwu r10, r11, r10 -; P8LE-NEXT: sub r11, r6, r3 -; P8LE-NEXT: srwi r11, r11, 1 -; P8LE-NEXT: srwi r7, r7, 5 -; P8LE-NEXT: add r3, r11, r3 -; P8LE-NEXT: srwi r9, r9, 8 -; P8LE-NEXT: srwi r10, r10, 2 -; P8LE-NEXT: srwi r3, r3, 6 -; P8LE-NEXT: mulli r7, r7, 98 -; P8LE-NEXT: mulli r9, r9, 1003 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: mulli r10, r10, 124 -; P8LE-NEXT: sub r5, r5, r7 -; P8LE-NEXT: sub r7, r8, r9 -; P8LE-NEXT: sub r3, r6, r3 -; P8LE-NEXT: mtvsrd v2, r5 -; P8LE-NEXT: sub r4, r4, r10 -; P8LE-NEXT: mtvsrd v3, r7 -; P8LE-NEXT: mtvsrd v4, r3 -; P8LE-NEXT: mtvsrd v5, r4 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: vmrghh v3, v5, v4 -; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P8LE-NEXT: xxlxor v4, v4, v4 +; P8LE-NEXT: vspltisw v5, 8 +; P8LE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P8LE-NEXT: lvx v3, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P8LE-NEXT: vadduwm v5, v5, v5 +; P8LE-NEXT: vsrh v3, v2, v3 +; P8LE-NEXT: lvx v0, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P8LE-NEXT: vmrglh v3, v4, v3 +; P8LE-NEXT: vmuluwm v3, v3, v0 +; P8LE-NEXT: vsrw v3, v3, v5 +; P8LE-NEXT: vpkuwum v0, v3, v3 +; P8LE-NEXT: vsubuhm v0, v2, v0 +; P8LE-NEXT: vmrglh v4, v4, v0 +; P8LE-NEXT: lvx v0, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P8LE-NEXT: vmuluwm v4, v4, v0 +; P8LE-NEXT: vsrw v4, v4, v5 +; P8LE-NEXT: vadduhm v3, v4, v3 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI0_4@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI0_4@toc@l +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: vsrh v3, v3, v4 +; P8LE-NEXT: vxor v4, v4, v4 +; P8LE-NEXT: vmladduhm v3, v3, v5, v4 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: fold_urem_vec_1: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: lis r7, 16727 -; P8BE-NEXT: lis r9, 21399 -; P8BE-NEXT: lis r10, 8456 -; P8BE-NEXT: ori r3, r3, 8969 
-; P8BE-NEXT: ori r7, r7, 2287 -; P8BE-NEXT: ori r9, r9, 33437 -; P8BE-NEXT: ori r10, r10, 16913 -; P8BE-NEXT: rldicl r6, r4, 16, 48 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: clrlwi r6, r6, 16 -; P8BE-NEXT: rldicl r8, r4, 48, 48 -; P8BE-NEXT: clrlwi r5, r5, 16 -; P8BE-NEXT: mulhwu r3, r6, r3 -; P8BE-NEXT: rldicl r4, r4, 32, 48 -; P8BE-NEXT: clrlwi r8, r8, 16 -; P8BE-NEXT: mulhwu r7, r5, r7 -; P8BE-NEXT: rlwinm r11, r4, 30, 18, 31 -; P8BE-NEXT: clrlwi r4, r4, 16 -; P8BE-NEXT: mulhwu r9, r8, r9 -; P8BE-NEXT: mulhwu r10, r11, r10 -; P8BE-NEXT: sub r11, r6, r3 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: srwi r7, r7, 8 -; P8BE-NEXT: add r3, r11, r3 -; P8BE-NEXT: srwi r9, r9, 5 -; P8BE-NEXT: srwi r10, r10, 2 -; P8BE-NEXT: mulli r7, r7, 1003 -; P8BE-NEXT: srwi r3, r3, 6 -; P8BE-NEXT: mulli r9, r9, 98 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: mulli r10, r10, 124 -; P8BE-NEXT: sub r5, r5, r7 -; P8BE-NEXT: sub r7, r8, r9 -; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: sub r3, r6, r3 -; P8BE-NEXT: sub r4, r4, r10 -; P8BE-NEXT: mtvsrd v2, r5 -; P8BE-NEXT: sldi r5, r7, 48 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v3, r5 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: mtvsrd v5, r4 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: vmrghh v3, v4, v5 -; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P8BE-NEXT: xxlxor v5, v5, v5 +; P8BE-NEXT: vspltisw v0, 8 +; P8BE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; P8BE-NEXT: lxvw4x v3, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_1@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI0_1@toc@l +; P8BE-NEXT: vadduwm v0, v0, v0 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_2@toc@ha +; P8BE-NEXT: vsrh v3, v2, v3 +; P8BE-NEXT: addi r3, r3, .LCPI0_2@toc@l +; P8BE-NEXT: lxvw4x v1, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_3@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI0_3@toc@l +; P8BE-NEXT: vperm v3, v5, v3, v4 +; P8BE-NEXT: vmuluwm v3, v3, v1 +; P8BE-NEXT: vsrw v3, v3, v0 +; P8BE-NEXT: vpkuwum v1, v3, v3 +; P8BE-NEXT: vsubuhm v1, v2, v1 +; P8BE-NEXT: vperm v4, v5, v1, v4 +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_4@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI0_4@toc@l +; P8BE-NEXT: vmuluwm v4, v4, v5 +; P8BE-NEXT: vsrw v4, v4, v0 +; P8BE-NEXT: vadduhm v3, v4, v3 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI0_5@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI0_5@toc@l +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: vsrh v3, v3, v4 +; P8BE-NEXT: vxor v4, v4, v4 +; P8BE-NEXT: vmladduhm v3, v3, v5, v4 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -222,217 +169,88 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { ; P9LE-LABEL: fold_urem_vec_2: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: lis r4, 22765 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 8969 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r5, r3, r4 -; P9LE-NEXT: sub r6, r3, r5 -; P9LE-NEXT: srwi r6, r6, 1 -; P9LE-NEXT: add r5, r6, r5 -; P9LE-NEXT: srwi r5, r5, 6 -; P9LE-NEXT: mulli r5, r5, 95 -; P9LE-NEXT: sub r3, r3, r5 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r5, r3, r4 -; P9LE-NEXT: sub r6, r3, r5 -; P9LE-NEXT: srwi r6, r6, 1 -; P9LE-NEXT: add r5, r6, r5 -; P9LE-NEXT: srwi r5, r5, 6 -; P9LE-NEXT: mulli r5, r5, 95 -; P9LE-NEXT: sub r3, r3, r5 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; 
P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r5, r3, r4 -; P9LE-NEXT: sub r6, r3, r5 -; P9LE-NEXT: srwi r6, r6, 1 -; P9LE-NEXT: add r5, r6, r5 -; P9LE-NEXT: srwi r5, r5, 6 -; P9LE-NEXT: mulli r5, r5, 95 -; P9LE-NEXT: sub r3, r3, r5 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: sub r5, r3, r4 -; P9LE-NEXT: srwi r5, r5, 1 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P9LE-NEXT: xxlxor v3, v3, v3 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: vmrglh v3, v3, v2 +; P9LE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P9LE-NEXT: vmuluwm v3, v3, v4 +; P9LE-NEXT: vspltisw v4, 11 +; P9LE-NEXT: vadduwm v4, v4, v4 +; P9LE-NEXT: vsrw v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: fold_urem_vec_2: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r4, 22765 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 8969 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r5, r3, r4 -; P9BE-NEXT: sub r6, r3, r5 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r5, r6, r5 -; P9BE-NEXT: srwi r5, r5, 6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: sub r3, r3, r5 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r5, r3, r4 -; P9BE-NEXT: sub r6, r3, r5 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r5, r6, r5 -; P9BE-NEXT: srwi r5, r5, 6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: sub r3, r3, r5 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r5, r3, r4 -; P9BE-NEXT: sub r6, r3, r5 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r5, r6, r5 -; P9BE-NEXT: srwi r5, r5, 6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: sub r3, r3, r5 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: sub r5, r3, r4 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P9BE-NEXT: xxlxor v4, v4, v4 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P9BE-NEXT: vperm v3, v4, v2, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI1_2@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI1_2@toc@l +; P9BE-NEXT: vmuluwm v3, v3, v4 +; P9BE-NEXT: vspltisw v4, 11 +; P9BE-NEXT: vadduwm v4, v4, v4 +; P9BE-NEXT: vsrw v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, 
v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: fold_urem_vec_2: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: ori r3, r3, 8969 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: clrldi r5, r4, 48 -; P8LE-NEXT: rldicl r6, r4, 48, 48 -; P8LE-NEXT: clrlwi r5, r5, 16 -; P8LE-NEXT: rldicl r7, r4, 32, 48 -; P8LE-NEXT: clrlwi r6, r6, 16 -; P8LE-NEXT: mulhwu r8, r5, r3 -; P8LE-NEXT: rldicl r4, r4, 16, 48 -; P8LE-NEXT: clrlwi r7, r7, 16 -; P8LE-NEXT: mulhwu r9, r6, r3 -; P8LE-NEXT: clrlwi r4, r4, 16 -; P8LE-NEXT: mulhwu r10, r7, r3 -; P8LE-NEXT: mulhwu r3, r4, r3 -; P8LE-NEXT: sub r11, r5, r8 -; P8LE-NEXT: sub r12, r6, r9 -; P8LE-NEXT: srwi r11, r11, 1 -; P8LE-NEXT: add r8, r11, r8 -; P8LE-NEXT: sub r11, r7, r10 -; P8LE-NEXT: srwi r12, r12, 1 -; P8LE-NEXT: add r9, r12, r9 -; P8LE-NEXT: sub r12, r4, r3 -; P8LE-NEXT: srwi r11, r11, 1 -; P8LE-NEXT: srwi r8, r8, 6 -; P8LE-NEXT: add r10, r11, r10 -; P8LE-NEXT: srwi r11, r12, 1 -; P8LE-NEXT: srwi r9, r9, 6 -; P8LE-NEXT: add r3, r11, r3 -; P8LE-NEXT: mulli r8, r8, 95 -; P8LE-NEXT: srwi r10, r10, 6 -; P8LE-NEXT: srwi r3, r3, 6 -; P8LE-NEXT: mulli r9, r9, 95 -; P8LE-NEXT: mulli r10, r10, 95 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: sub r5, r5, r8 -; P8LE-NEXT: sub r6, r6, r9 -; P8LE-NEXT: mtvsrd v2, r5 -; P8LE-NEXT: sub r5, r7, r10 -; P8LE-NEXT: sub r3, r4, r3 -; P8LE-NEXT: mtvsrd v3, r6 -; P8LE-NEXT: mtvsrd v4, r5 -; P8LE-NEXT: mtvsrd v5, r3 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: vmrghh v3, v5, v4 -; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: xxlxor v3, v3, v3 +; P8LE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P8LE-NEXT: vspltisw v4, 11 +; P8LE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; P8LE-NEXT: vmrglh v3, v3, v2 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: vxor v4, v4, v4 +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vmladduhm v3, v3, v5, v4 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: fold_urem_vec_2: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: ori r3, r3, 8969 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: clrlwi r5, r5, 16 -; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: clrlwi r6, r6, 16 -; P8BE-NEXT: mulhwu r8, r5, r3 -; P8BE-NEXT: rldicl r4, r4, 16, 48 -; P8BE-NEXT: clrlwi r7, r7, 16 -; P8BE-NEXT: mulhwu r9, r6, r3 -; P8BE-NEXT: clrlwi r4, r4, 16 -; P8BE-NEXT: mulhwu r10, r7, r3 -; P8BE-NEXT: mulhwu r3, r4, r3 -; P8BE-NEXT: sub r11, r5, r8 -; P8BE-NEXT: sub r12, r6, r9 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: add r8, r11, r8 -; P8BE-NEXT: sub r11, r7, r10 -; P8BE-NEXT: srwi r12, r12, 1 -; P8BE-NEXT: add r9, r12, r9 -; P8BE-NEXT: sub r12, r4, r3 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: srwi r8, r8, 6 -; P8BE-NEXT: add r10, r11, r10 -; P8BE-NEXT: srwi r11, r12, 1 -; P8BE-NEXT: srwi r9, r9, 6 -; P8BE-NEXT: add r3, r11, r3 -; P8BE-NEXT: srwi r10, r10, 6 -; P8BE-NEXT: srwi r3, r3, 6 -; P8BE-NEXT: mulli r8, r8, 95 -; P8BE-NEXT: mulli r9, r9, 95 -; P8BE-NEXT: mulli r10, r10, 95 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: sub r5, r5, r8 -; P8BE-NEXT: sub r6, r6, r9 -; P8BE-NEXT: sub r7, r7, r10 -; P8BE-NEXT: sub r3, r4, r3 -; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: sldi r4, r7, 48 -; P8BE-NEXT: mtvsrd v2, r5 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v3, r6 -; P8BE-NEXT: mtvsrd v4, 
r4 -; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: vmrghh v3, v5, v4 -; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P8BE-NEXT: xxlxor v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; P8BE-NEXT: lxvw4x v3, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI1_1@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI1_2@toc@ha +; P8BE-NEXT: vperm v3, v4, v2, v3 +; P8BE-NEXT: vspltisw v4, 11 +; P8BE-NEXT: addi r3, r3, .LCPI1_2@toc@l +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: vxor v4, v4, v4 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vmladduhm v3, v3, v5, v4 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -443,259 +261,92 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; P9LE-LABEL: combine_urem_udiv: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: lis r4, 22765 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 8969 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r5, r3, r4 -; P9LE-NEXT: sub r6, r3, r5 -; P9LE-NEXT: srwi r6, r6, 1 -; P9LE-NEXT: add r5, r6, r5 -; P9LE-NEXT: srwi r5, r5, 6 -; P9LE-NEXT: mulli r6, r5, 95 -; P9LE-NEXT: sub r3, r3, r6 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r6, r3, 16 -; P9LE-NEXT: mulhwu r7, r6, r4 -; P9LE-NEXT: sub r6, r6, r7 -; P9LE-NEXT: srwi r6, r6, 1 -; P9LE-NEXT: add r6, r6, r7 -; P9LE-NEXT: srwi r6, r6, 6 -; P9LE-NEXT: mulli r7, r6, 95 -; P9LE-NEXT: sub r3, r3, r7 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: clrlwi r7, r3, 16 -; P9LE-NEXT: mulhwu r8, r7, r4 -; P9LE-NEXT: sub r7, r7, r8 -; P9LE-NEXT: srwi r7, r7, 1 -; P9LE-NEXT: add r7, r7, r8 -; P9LE-NEXT: srwi r7, r7, 6 -; P9LE-NEXT: mulli r8, r7, 95 -; P9LE-NEXT: sub r3, r3, r8 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r8, r3, 16 -; P9LE-NEXT: mulhwu r4, r8, r4 -; P9LE-NEXT: sub r8, r8, r4 -; P9LE-NEXT: srwi r8, r8, 1 -; P9LE-NEXT: add r4, r8, r4 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r8, r4, 95 -; P9LE-NEXT: mtvsrd v5, r4 -; P9LE-NEXT: sub r3, r3, r8 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; P9LE-NEXT: mtvsrd v4, r6 -; P9LE-NEXT: vmrglw v2, v2, v3 -; P9LE-NEXT: mtvsrd v3, r5 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: mtvsrd v4, r7 -; P9LE-NEXT: vmrghh v4, v5, v4 -; P9LE-NEXT: vmrglw v3, v4, v3 +; P9LE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P9LE-NEXT: xxlxor v4, v4, v4 +; P9LE-NEXT: vspltisw v5, 11 +; P9LE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P9LE-NEXT: vmrglh v4, v4, v2 +; P9LE-NEXT: vadduwm v5, v5, v5 +; P9LE-NEXT: lxvx v3, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P9LE-NEXT: vmuluwm v3, v4, v3 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vsrw v3, v3, v5 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: vmladduhm v4, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v4 ; P9LE-NEXT: vadduhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: combine_urem_udiv: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r5, 22765 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r5, r5, 8969 -; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: mulhwu r6, r4, r5 -; P9BE-NEXT: sub r4, r4, r6 -; P9BE-NEXT: srwi r4, r4, 1 -; 
P9BE-NEXT: add r4, r4, r6 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r6, r4, 95 -; P9BE-NEXT: sub r3, r3, r6 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r6, r3, 16 -; P9BE-NEXT: mulhwu r7, r6, r5 -; P9BE-NEXT: sub r6, r6, r7 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r6, r6, r7 -; P9BE-NEXT: srwi r6, r6, 6 -; P9BE-NEXT: mulli r7, r6, 95 -; P9BE-NEXT: sub r3, r3, r7 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: clrlwi r7, r3, 16 -; P9BE-NEXT: mulhwu r8, r7, r5 -; P9BE-NEXT: sub r7, r7, r8 -; P9BE-NEXT: srwi r7, r7, 1 -; P9BE-NEXT: add r7, r7, r8 -; P9BE-NEXT: srwi r7, r7, 6 -; P9BE-NEXT: mulli r8, r7, 95 -; P9BE-NEXT: sub r3, r3, r8 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r5, r3, r5 -; P9BE-NEXT: sub r8, r3, r5 -; P9BE-NEXT: srwi r8, r8, 1 -; P9BE-NEXT: add r5, r8, r5 -; P9BE-NEXT: srwi r5, r5, 6 -; P9BE-NEXT: mulli r8, r5, 95 -; P9BE-NEXT: sub r3, r3, r8 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: sldi r3, r4, 48 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v2, v3 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: sldi r3, r6, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: sldi r3, r7, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: sldi r3, r5, 48 -; P9BE-NEXT: mtvsrd v5, r3 -; P9BE-NEXT: vmrghh v4, v5, v4 -; P9BE-NEXT: vmrghw v3, v4, v3 +; P9BE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P9BE-NEXT: xxlxor v4, v4, v4 +; P9BE-NEXT: vspltisw v5, 11 +; P9BE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P9BE-NEXT: vadduwm v5, v5, v5 +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI2_2@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI2_2@toc@l +; P9BE-NEXT: vperm v3, v4, v2, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P9BE-NEXT: vmuluwm v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vsrw v3, v3, v5 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: vmladduhm v4, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v4 ; P9BE-NEXT: vadduhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: combine_urem_udiv: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; P8LE-NEXT: ori r3, r3, 8969 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: clrldi r5, r4, 48 -; P8LE-NEXT: rldicl r6, r4, 48, 48 -; P8LE-NEXT: clrlwi r5, r5, 16 -; P8LE-NEXT: clrlwi r8, r6, 16 -; P8LE-NEXT: rldicl r7, r4, 32, 48 -; P8LE-NEXT: rldicl r4, r4, 16, 48 -; P8LE-NEXT: mulhwu r9, r5, r3 -; P8LE-NEXT: mulhwu r11, r8, r3 -; P8LE-NEXT: clrlwi r10, r7, 16 -; P8LE-NEXT: clrlwi r12, r4, 16 -; P8LE-NEXT: mulhwu r0, r10, r3 -; P8LE-NEXT: mulhwu r3, r12, r3 -; P8LE-NEXT: sub r30, r5, r9 -; P8LE-NEXT: sub r8, r8, r11 -; P8LE-NEXT: srwi r30, r30, 1 -; P8LE-NEXT: srwi r8, r8, 1 -; P8LE-NEXT: sub r10, r10, r0 -; P8LE-NEXT: add r9, r30, r9 -; P8LE-NEXT: add r8, r8, r11 -; P8LE-NEXT: sub r11, r12, r3 -; P8LE-NEXT: srwi r10, r10, 1 -; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload -; P8LE-NEXT: srwi r9, r9, 6 -; P8LE-NEXT: srwi r11, r11, 1 -; P8LE-NEXT: srwi r8, r8, 6 -; P8LE-NEXT: add r10, r10, r0 -; P8LE-NEXT: mulli r12, r9, 95 -; P8LE-NEXT: add r3, r11, r3 -; P8LE-NEXT: mtvsrd v2, r9 -; 
P8LE-NEXT: srwi r10, r10, 6 -; P8LE-NEXT: mulli r9, r8, 95 -; P8LE-NEXT: srwi r3, r3, 6 -; P8LE-NEXT: mtvsrd v3, r8 -; P8LE-NEXT: mulli r8, r10, 95 -; P8LE-NEXT: mtvsrd v4, r10 -; P8LE-NEXT: mulli r10, r3, 95 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: sub r5, r5, r12 -; P8LE-NEXT: sub r6, r6, r9 -; P8LE-NEXT: mtvsrd v3, r5 -; P8LE-NEXT: mtvsrd v5, r6 -; P8LE-NEXT: sub r5, r7, r8 -; P8LE-NEXT: sub r4, r4, r10 -; P8LE-NEXT: mtvsrd v0, r5 -; P8LE-NEXT: mtvsrd v1, r4 -; P8LE-NEXT: vmrghh v3, v5, v3 -; P8LE-NEXT: mtvsrd v5, r3 -; P8LE-NEXT: vmrghh v0, v1, v0 -; P8LE-NEXT: vmrghh v4, v5, v4 -; P8LE-NEXT: vmrglw v3, v0, v3 -; P8LE-NEXT: vmrglw v2, v4, v2 -; P8LE-NEXT: vadduhm v2, v3, v2 +; P8LE-NEXT: xxlxor v3, v3, v3 +; P8LE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P8LE-NEXT: vspltisw v4, 11 +; P8LE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P8LE-NEXT: vmrglh v3, v3, v2 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: vxor v4, v4, v4 +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vmladduhm v4, v3, v5, v4 +; P8LE-NEXT: vsubuhm v2, v2, v4 +; P8LE-NEXT: vadduhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: combine_urem_udiv: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r5, v2 -; P8BE-NEXT: lis r4, 22765 -; P8BE-NEXT: ori r4, r4, 8969 -; P8BE-NEXT: clrldi r3, r5, 48 -; P8BE-NEXT: rldicl r6, r5, 48, 48 -; P8BE-NEXT: clrlwi r8, r3, 16 -; P8BE-NEXT: rldicl r7, r5, 32, 48 -; P8BE-NEXT: clrlwi r9, r6, 16 -; P8BE-NEXT: rldicl r5, r5, 16, 48 -; P8BE-NEXT: mulhwu r10, r8, r4 -; P8BE-NEXT: clrlwi r11, r7, 16 -; P8BE-NEXT: mulhwu r12, r9, r4 -; P8BE-NEXT: clrlwi r5, r5, 16 -; P8BE-NEXT: mulhwu r0, r11, r4 -; P8BE-NEXT: mulhwu r4, r5, r4 -; P8BE-NEXT: sub r8, r8, r10 -; P8BE-NEXT: sub r9, r9, r12 -; P8BE-NEXT: srwi r8, r8, 1 -; P8BE-NEXT: add r8, r8, r10 -; P8BE-NEXT: sub r10, r11, r0 -; P8BE-NEXT: srwi r9, r9, 1 -; P8BE-NEXT: sub r11, r5, r4 -; P8BE-NEXT: add r9, r9, r12 -; P8BE-NEXT: srwi r8, r8, 6 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: srwi r10, r10, 1 -; P8BE-NEXT: srwi r9, r9, 6 -; P8BE-NEXT: mulli r12, r8, 95 -; P8BE-NEXT: add r4, r11, r4 -; P8BE-NEXT: add r10, r10, r0 -; P8BE-NEXT: mulli r11, r9, 95 -; P8BE-NEXT: srwi r4, r4, 6 -; P8BE-NEXT: srwi r10, r10, 6 -; P8BE-NEXT: sldi r9, r9, 48 -; P8BE-NEXT: sldi r8, r8, 48 -; P8BE-NEXT: mtvsrd v3, r9 -; P8BE-NEXT: mulli r9, r4, 95 -; P8BE-NEXT: mtvsrd v2, r8 -; P8BE-NEXT: mulli r8, r10, 95 -; P8BE-NEXT: sub r3, r3, r12 -; P8BE-NEXT: sub r6, r6, r11 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: sldi r10, r10, 48 -; P8BE-NEXT: mtvsrd v3, r3 -; P8BE-NEXT: sub r3, r5, r9 -; P8BE-NEXT: sub r7, r7, r8 -; P8BE-NEXT: mtvsrd v5, r6 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: sldi r5, r7, 48 -; P8BE-NEXT: mtvsrd v1, r3 -; P8BE-NEXT: sldi r3, r4, 48 -; P8BE-NEXT: mtvsrd v4, r10 -; P8BE-NEXT: mtvsrd v0, r5 -; P8BE-NEXT: vmrghh v3, v5, v3 -; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v0, v1, v0 -; P8BE-NEXT: vmrghh v4, v5, v4 -; P8BE-NEXT: vmrghw v3, v0, v3 -; P8BE-NEXT: vmrghw v2, v4, v2 -; P8BE-NEXT: vadduhm v2, v3, v2 +; P8BE-NEXT: addis r3, r2, .LCPI2_1@toc@ha +; P8BE-NEXT: xxlxor v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI2_1@toc@l +; P8BE-NEXT: lxvw4x v3, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI2_2@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI2_2@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, 
.LCPI2_0@toc@ha +; P8BE-NEXT: vperm v3, v4, v2, v3 +; P8BE-NEXT: vspltisw v4, 11 +; P8BE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: vxor v4, v4, v4 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vmladduhm v4, v3, v5, v4 +; P8BE-NEXT: vsubuhm v2, v2, v4 +; P8BE-NEXT: vadduhm v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, %2 = udiv <4 x i16> %x, @@ -707,133 +358,104 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; P9LE-LABEL: dont_fold_urem_power_of_two: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: lis r4, 22765 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 8969 -; P9LE-NEXT: clrlwi r3, r3, 26 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r3, r3, 27 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: sub r5, r3, r4 -; P9LE-NEXT: srwi r5, r5, 1 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r3, r3, 29 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v4, v2 -; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P9LE-NEXT: xxlxor v3, v3, v3 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: vmrglh v3, v3, v2 +; P9LE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P9LE-NEXT: vmuluwm v3, v3, v4 +; P9LE-NEXT: vspltisw v4, 8 +; P9LE-NEXT: vadduwm v4, v4, v4 +; P9LE-NEXT: vsrw v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI3_2@toc@ha +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P9LE-NEXT: vsrh v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: dont_fold_urem_power_of_two: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: lis r4, 22765 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 8969 -; P9BE-NEXT: clrlwi r3, r3, 27 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 26 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: sub r5, r3, r4 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 29 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v3, v2 +; P9BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P9BE-NEXT: xxlxor v4, v4, v4 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P9BE-NEXT: vperm v3, v4, v2, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, 
.LCPI3_2@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P9BE-NEXT: vmuluwm v3, v3, v4 +; P9BE-NEXT: vspltisw v4, 8 +; P9BE-NEXT: vadduwm v4, v4, v4 +; P9BE-NEXT: vsrw v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI3_3@toc@ha +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: addi r3, r3, .LCPI3_3@toc@l +; P9BE-NEXT: vsrh v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_urem_power_of_two: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: ori r3, r3, 8969 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: rldicl r5, r4, 16, 48 -; P8LE-NEXT: rldicl r7, r4, 48, 48 -; P8LE-NEXT: clrlwi r5, r5, 16 -; P8LE-NEXT: mulhwu r3, r5, r3 -; P8LE-NEXT: sub r6, r5, r3 -; P8LE-NEXT: srwi r6, r6, 1 -; P8LE-NEXT: add r3, r6, r3 -; P8LE-NEXT: clrldi r6, r4, 48 -; P8LE-NEXT: srwi r3, r3, 6 -; P8LE-NEXT: clrlwi r6, r6, 26 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: rldicl r4, r4, 32, 48 -; P8LE-NEXT: mtvsrd v2, r6 -; P8LE-NEXT: clrlwi r6, r7, 27 -; P8LE-NEXT: clrlwi r4, r4, 29 -; P8LE-NEXT: mtvsrd v3, r6 -; P8LE-NEXT: mtvsrd v5, r4 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: sub r3, r5, r3 -; P8LE-NEXT: mtvsrd v4, r3 -; P8LE-NEXT: vmrghh v3, v4, v5 -; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: xxlxor v3, v3, v3 +; P8LE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P8LE-NEXT: vspltisw v4, 8 +; P8LE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P8LE-NEXT: vmrglh v3, v3, v2 +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P8LE-NEXT: vadduwm v4, v4, v4 +; P8LE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P8LE-NEXT: vmuluwm v3, v3, v5 +; P8LE-NEXT: vsrw v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI3_2@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P8LE-NEXT: lvx v5, 0, r3 +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vsrh v3, v3, v4 +; P8LE-NEXT: vxor v4, v4, v4 +; P8LE-NEXT: vmladduhm v3, v3, v5, v4 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_urem_power_of_two: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: ori r3, r3, 8969 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r7, r4, 16, 48 -; P8BE-NEXT: clrlwi r5, r5, 16 -; P8BE-NEXT: clrlwi r7, r7, 26 -; P8BE-NEXT: mulhwu r3, r5, r3 -; P8BE-NEXT: sub r6, r5, r3 -; P8BE-NEXT: srwi r6, r6, 1 -; P8BE-NEXT: add r3, r6, r3 -; P8BE-NEXT: rldicl r6, r4, 32, 48 -; P8BE-NEXT: srwi r3, r3, 6 -; P8BE-NEXT: rldicl r4, r4, 48, 48 -; P8BE-NEXT: clrlwi r6, r6, 27 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: clrlwi r4, r4, 29 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: sldi r6, r7, 48 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v3, r6 -; P8BE-NEXT: mtvsrd v5, r4 -; P8BE-NEXT: sub r3, r5, r3 -; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: vmrghh v3, v5, v4 -; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P8BE-NEXT: xxlxor v4, v4, v4 +; P8BE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P8BE-NEXT: lxvw4x v3, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI3_2@toc@ha +; P8BE-NEXT: vperm v3, v4, v2, v3 +; P8BE-NEXT: vspltisw v4, 8 +; P8BE-NEXT: addi r3, r3, .LCPI3_2@toc@l +; P8BE-NEXT: vadduwm v4, v4, v4 +; P8BE-NEXT: vmuluwm v3, v3, v5 +; P8BE-NEXT: vsrw v3, v3, v4 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; 
P8BE-NEXT: addis r3, r2, .LCPI3_3@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI3_3@toc@l +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vsrh v3, v3, v4 +; P8BE-NEXT: vxor v4, v4, v4 +; P8BE-NEXT: vmladduhm v3, v3, v5, v4 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -843,167 +465,156 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { ; P9LE-LABEL: dont_fold_urem_one: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: lis r4, -19946 -; P9LE-NEXT: lis r5, -14230 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 17097 -; P9LE-NEXT: ori r5, r5, 30865 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: srwi r4, r4, 4 -; P9LE-NEXT: mulli r4, r4, 23 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, 24749 -; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: ori r4, r4, 47143 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: srwi r4, r4, 11 -; P9LE-NEXT: mulli r4, r4, 5423 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: clrlwi r4, r3, 16 -; P9LE-NEXT: rlwinm r3, r3, 31, 17, 31 -; P9LE-NEXT: mulhwu r3, r3, r5 -; P9LE-NEXT: srwi r3, r3, 8 -; P9LE-NEXT: mulli r3, r3, 654 -; P9LE-NEXT: sub r3, r4, r3 -; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: vmrghh v2, v2, v4 -; P9LE-NEXT: vmrglw v2, v3, v2 +; P9LE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P9LE-NEXT: xxlxor v3, v3, v3 +; P9LE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P9LE-NEXT: vmrglh v4, v3, v2 +; P9LE-NEXT: lxvx v5, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P9LE-NEXT: vmuluwm v4, v4, v5 +; P9LE-NEXT: vspltisw v5, 8 +; P9LE-NEXT: vadduwm v5, v5, v5 +; P9LE-NEXT: vsrw v4, v4, v5 +; P9LE-NEXT: vpkuwum v0, v4, v4 +; P9LE-NEXT: vsubuhm v0, v2, v0 +; P9LE-NEXT: vmrglh v3, v3, v0 +; P9LE-NEXT: lxvx v0, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P9LE-NEXT: vmuluwm v3, v3, v0 +; P9LE-NEXT: vsrw v3, v3, v5 +; P9LE-NEXT: vxor v5, v5, v5 +; P9LE-NEXT: vadduhm v3, v3, v4 +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P9LE-NEXT: vpkuwum v3, v3, v3 +; P9LE-NEXT: lxvx vs0, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI4_4@toc@ha +; P9LE-NEXT: vsrh v3, v3, v4 +; P9LE-NEXT: addi r3, r3, .LCPI4_4@toc@l +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: xxsel v3, v3, v2, vs0 +; P9LE-NEXT: vmladduhm v3, v3, v4, v5 +; P9LE-NEXT: vsubuhm v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: dont_fold_urem_one: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r4, 24749 -; P9BE-NEXT: lis r5, -14230 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 47143 -; P9BE-NEXT: ori r5, r5, 30865 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: srwi r4, r4, 11 -; P9BE-NEXT: mulli r4, r4, 5423 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, -19946 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: ori r4, r4, 17097 -; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: srwi r4, r4, 4 -; P9BE-NEXT: mulli r4, r4, 23 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, 
r3, v2 -; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: rlwinm r3, r3, 31, 17, 31 -; P9BE-NEXT: mulhwu r3, r3, r5 -; P9BE-NEXT: srwi r3, r3, 8 -; P9BE-NEXT: mulli r3, r3, 654 -; P9BE-NEXT: sub r3, r4, r3 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: vmrghh v2, v4, v2 -; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P9BE-NEXT: xxlxor v4, v4, v4 +; P9BE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P9BE-NEXT: lxvx v3, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P9BE-NEXT: lxvx v0, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P9BE-NEXT: vperm v5, v4, v2, v3 +; P9BE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P9BE-NEXT: vmuluwm v5, v5, v0 +; P9BE-NEXT: vspltisw v0, 8 +; P9BE-NEXT: vadduwm v0, v0, v0 +; P9BE-NEXT: vsrw v5, v5, v0 +; P9BE-NEXT: vpkuwum v1, v5, v5 +; P9BE-NEXT: vsubuhm v1, v2, v1 +; P9BE-NEXT: vperm v3, v4, v1, v3 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P9BE-NEXT: vmuluwm v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_4@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI4_4@toc@l +; P9BE-NEXT: lxvx vs0, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_5@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI4_5@toc@l +; P9BE-NEXT: vsrw v3, v3, v0 +; P9BE-NEXT: vadduhm v3, v3, v5 +; P9BE-NEXT: vxor v5, v5, v5 +; P9BE-NEXT: vpkuwum v3, v3, v3 +; P9BE-NEXT: vsrh v3, v3, v4 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: xxsel v3, v3, v2, vs0 +; P9BE-NEXT: vmladduhm v3, v3, v4, v5 +; P9BE-NEXT: vsubuhm v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_urem_one: ; P8LE: # %bb.0: -; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, -14230 -; P8LE-NEXT: lis r7, -19946 -; P8LE-NEXT: lis r9, 24749 -; P8LE-NEXT: ori r3, r3, 30865 -; P8LE-NEXT: ori r7, r7, 17097 -; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: rldicl r5, r4, 48, 48 -; P8LE-NEXT: rldicl r6, r4, 32, 48 -; P8LE-NEXT: rldicl r4, r4, 16, 48 -; P8LE-NEXT: rlwinm r8, r5, 31, 17, 31 -; P8LE-NEXT: clrlwi r6, r6, 16 -; P8LE-NEXT: clrlwi r5, r5, 16 -; P8LE-NEXT: mulhwu r3, r8, r3 -; P8LE-NEXT: ori r8, r9, 47143 -; P8LE-NEXT: clrlwi r4, r4, 16 -; P8LE-NEXT: li r9, 0 -; P8LE-NEXT: mulhwu r7, r6, r7 -; P8LE-NEXT: mulhwu r8, r4, r8 -; P8LE-NEXT: mtvsrd v2, r9 -; P8LE-NEXT: srwi r3, r3, 8 -; P8LE-NEXT: srwi r7, r7, 4 -; P8LE-NEXT: mulli r3, r3, 654 -; P8LE-NEXT: srwi r8, r8, 11 -; P8LE-NEXT: mulli r7, r7, 23 -; P8LE-NEXT: mulli r8, r8, 5423 -; P8LE-NEXT: sub r3, r5, r3 -; P8LE-NEXT: sub r5, r6, r7 -; P8LE-NEXT: mtvsrd v3, r3 -; P8LE-NEXT: sub r3, r4, r8 -; P8LE-NEXT: mtvsrd v4, r5 -; P8LE-NEXT: mtvsrd v5, r3 -; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: vmrghh v3, v5, v4 -; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: xxlxor v3, v3, v3 +; P8LE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P8LE-NEXT: vspltisw v5, 8 +; P8LE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P8LE-NEXT: vmrglh v4, v3, v2 +; P8LE-NEXT: lvx v0, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P8LE-NEXT: vadduwm v5, v5, v5 +; P8LE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P8LE-NEXT: vmuluwm v4, v4, v0 +; P8LE-NEXT: vsrw v4, v4, v5 +; P8LE-NEXT: vpkuwum v0, v4, v4 +; P8LE-NEXT: vsubuhm v0, v2, v0 +; P8LE-NEXT: vmrglh v3, v3, v0 +; P8LE-NEXT: lvx v0, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P8LE-NEXT: vmuluwm v3, v3, v0 +; P8LE-NEXT: vsrw v3, v3, v5 +; P8LE-NEXT: vxor 
v5, v5, v5 +; P8LE-NEXT: vadduhm v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P8LE-NEXT: vpkuwum v3, v3, v3 +; P8LE-NEXT: vsrh v3, v3, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: addis r3, r2, .LCPI4_4@toc@ha +; P8LE-NEXT: addi r3, r3, .LCPI4_4@toc@l +; P8LE-NEXT: xxsel v3, v3, v2, v4 +; P8LE-NEXT: lvx v4, 0, r3 +; P8LE-NEXT: vmladduhm v3, v3, v4, v5 +; P8LE-NEXT: vsubuhm v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_urem_one: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 24749 -; P8BE-NEXT: lis r7, -19946 -; P8BE-NEXT: lis r8, -14230 -; P8BE-NEXT: ori r3, r3, 47143 -; P8BE-NEXT: ori r7, r7, 17097 -; P8BE-NEXT: ori r8, r8, 30865 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: rldicl r4, r4, 32, 48 -; P8BE-NEXT: clrlwi r5, r5, 16 -; P8BE-NEXT: clrlwi r6, r6, 16 -; P8BE-NEXT: mulhwu r3, r5, r3 -; P8BE-NEXT: rlwinm r9, r4, 31, 17, 31 -; P8BE-NEXT: clrlwi r4, r4, 16 -; P8BE-NEXT: mulhwu r7, r6, r7 -; P8BE-NEXT: mulhwu r8, r9, r8 -; P8BE-NEXT: li r9, 0 -; P8BE-NEXT: srwi r3, r3, 11 -; P8BE-NEXT: srwi r7, r7, 4 -; P8BE-NEXT: mulli r3, r3, 5423 -; P8BE-NEXT: srwi r8, r8, 8 -; P8BE-NEXT: mulli r7, r7, 23 -; P8BE-NEXT: mulli r8, r8, 654 -; P8BE-NEXT: sub r3, r5, r3 -; P8BE-NEXT: sldi r5, r9, 48 -; P8BE-NEXT: mtvsrd v2, r5 -; P8BE-NEXT: sub r5, r6, r7 -; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: sub r4, r4, r8 -; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: mtvsrd v3, r3 -; P8BE-NEXT: sldi r3, r4, 48 -; P8BE-NEXT: mtvsrd v4, r5 -; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v3, v4, v3 -; P8BE-NEXT: vmrghh v2, v2, v5 -; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P8BE-NEXT: xxlxor v4, v4, v4 +; P8BE-NEXT: vspltisw v1, 8 +; P8BE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; P8BE-NEXT: lxvw4x v3, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_1@toc@l +; P8BE-NEXT: lxvw4x v0, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; P8BE-NEXT: vperm v5, v4, v2, v3 +; P8BE-NEXT: addi r3, r3, .LCPI4_2@toc@l +; P8BE-NEXT: vmuluwm v5, v5, v0 +; P8BE-NEXT: vadduwm v0, v1, v1 +; P8BE-NEXT: vsrw v5, v5, v0 +; P8BE-NEXT: vpkuwum v1, v5, v5 +; P8BE-NEXT: vsubuhm v1, v2, v1 +; P8BE-NEXT: vperm v3, v4, v1, v3 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_3@toc@l +; P8BE-NEXT: vmuluwm v3, v3, v4 +; P8BE-NEXT: lxvw4x v4, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_4@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_4@toc@l +; P8BE-NEXT: vsrw v3, v3, v0 +; P8BE-NEXT: lxvw4x vs0, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI4_5@toc@ha +; P8BE-NEXT: addi r3, r3, .LCPI4_5@toc@l +; P8BE-NEXT: vadduhm v3, v3, v5 +; P8BE-NEXT: lxvw4x v5, 0, r3 +; P8BE-NEXT: vpkuwum v3, v3, v3 +; P8BE-NEXT: vsrh v3, v3, v4 +; P8BE-NEXT: vxor v4, v4, v4 +; P8BE-NEXT: xxsel v3, v3, v2, vs0 +; P8BE-NEXT: vmladduhm v3, v3, v5, v4 +; P8BE-NEXT: vsubuhm v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -1022,129 +633,251 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { ; P9LE-LABEL: dont_fold_urem_i64: ; P9LE: # %bb.0: -; P9LE-NEXT: lis r4, 25644 -; P9LE-NEXT: mfvsrld r3, v3 -; P9LE-NEXT: ori r4, r4, 34192 +; P9LE-NEXT: lis r3, 25644 +; P9LE-NEXT: mfvsrld r4, v3 +; P9LE-NEXT: xxlxor v5, v5, v5 +; P9LE-NEXT: ori r3, r3, 34192 +; P9LE-NEXT: sldi r3, r3, 32 +; P9LE-NEXT: oris r3, r3, 45590 +; P9LE-NEXT: ori r3, r3, 17097 +; P9LE-NEXT: mulld r5, r4, r3 +; P9LE-NEXT: mulhdu r3, r4, 
r3 +; P9LE-NEXT: addis r4, r2, .LCPI6_0@toc@ha +; P9LE-NEXT: addi r4, r4, .LCPI6_0@toc@l +; P9LE-NEXT: lxvx v0, 0, r4 +; P9LE-NEXT: mtvsrdd v4, r3, r5 +; P9LE-NEXT: lis r3, -16037 +; P9LE-NEXT: mfvsrd r5, v3 +; P9LE-NEXT: ori r3, r3, 28749 +; P9LE-NEXT: sldi r3, r3, 32 +; P9LE-NEXT: vspltb v1, v0, 15 +; P9LE-NEXT: vsro v4, v4, v0 +; P9LE-NEXT: vsro v5, v5, v0 +; P9LE-NEXT: oris r3, r3, 52170 +; P9LE-NEXT: vsr v4, v4, v1 +; P9LE-NEXT: vsr v5, v5, v1 +; P9LE-NEXT: ori r3, r3, 12109 +; P9LE-NEXT: mulld r4, r5, r3 +; P9LE-NEXT: mulhdu r3, r5, r3 +; P9LE-NEXT: mtvsrdd v6, r3, r4 +; P9LE-NEXT: vsro v6, v6, v0 +; P9LE-NEXT: vsr v6, v6, v1 +; P9LE-NEXT: xxmrgld v6, v6, v4 +; P9LE-NEXT: vsubudm v4, v3, v4 +; P9LE-NEXT: mfvsrld r3, v4 +; P9LE-NEXT: sldi r4, r3, 63 +; P9LE-NEXT: rldicl r3, r3, 63, 1 +; P9LE-NEXT: mtvsrdd v4, r3, r4 +; P9LE-NEXT: addis r3, r2, .LCPI6_1@toc@ha +; P9LE-NEXT: vsro v4, v4, v0 +; P9LE-NEXT: addi r3, r3, .LCPI6_1@toc@l +; P9LE-NEXT: vsr v4, v4, v1 +; P9LE-NEXT: xxmrgld v4, v5, v4 +; P9LE-NEXT: vaddudm v4, v4, v6 +; P9LE-NEXT: lxvx v6, 0, r3 +; P9LE-NEXT: vsrd v4, v4, v6 +; P9LE-NEXT: mfvsrld r3, v4 +; P9LE-NEXT: mfvsrd r4, v4 +; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: mulli r3, r3, 23 +; P9LE-NEXT: mtvsrdd v4, r4, r3 +; P9LE-NEXT: addis r3, r2, .LCPI6_2@toc@ha +; P9LE-NEXT: lis r4, 25653 +; P9LE-NEXT: addi r3, r3, .LCPI6_2@toc@l +; P9LE-NEXT: vsubudm v3, v3, v4 +; P9LE-NEXT: ori r4, r4, 15432 +; P9LE-NEXT: lxvx v4, 0, r3 ; P9LE-NEXT: sldi r4, r4, 32 -; P9LE-NEXT: oris r4, r4, 45590 -; P9LE-NEXT: ori r4, r4, 17097 -; P9LE-NEXT: mulhdu r4, r3, r4 -; P9LE-NEXT: sub r5, r3, r4 -; P9LE-NEXT: rldicl r5, r5, 63, 1 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: lis r5, -16037 -; P9LE-NEXT: rldicl r4, r4, 60, 4 -; P9LE-NEXT: ori r5, r5, 28749 -; P9LE-NEXT: mulli r4, r4, 23 -; P9LE-NEXT: sldi r5, r5, 32 -; P9LE-NEXT: oris r5, r5, 52170 -; P9LE-NEXT: ori r5, r5, 12109 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: mfvsrd r4, v3 -; P9LE-NEXT: mulhdu r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 52, 12 -; P9LE-NEXT: mulli r5, r5, 5423 -; P9LE-NEXT: sub r4, r4, r5 -; P9LE-NEXT: lis r5, 25653 -; P9LE-NEXT: ori r5, r5, 15432 -; P9LE-NEXT: mtvsrdd v3, r4, r3 -; P9LE-NEXT: mfvsrd r3, v2 -; P9LE-NEXT: sldi r5, r5, 32 -; P9LE-NEXT: rldicl r4, r3, 63, 1 -; P9LE-NEXT: oris r5, r5, 1603 -; P9LE-NEXT: ori r5, r5, 21445 -; P9LE-NEXT: mulhdu r4, r4, r5 -; P9LE-NEXT: rldicl r4, r4, 57, 7 +; P9LE-NEXT: oris r4, r4, 1603 +; P9LE-NEXT: ori r4, r4, 21445 +; P9LE-NEXT: vsrd v4, v2, v4 +; P9LE-NEXT: mfvsrd r3, v4 +; P9LE-NEXT: mulld r5, r3, r4 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: mtvsrdd v4, r3, r5 +; P9LE-NEXT: addis r3, r2, .LCPI6_3@toc@ha +; P9LE-NEXT: vsro v4, v4, v0 +; P9LE-NEXT: addi r3, r3, .LCPI6_3@toc@l +; P9LE-NEXT: vsr v4, v4, v1 +; P9LE-NEXT: xxmrgld v4, v4, v5 +; P9LE-NEXT: xxspltd v5, v5, 1 +; P9LE-NEXT: xxlor v4, v5, v4 +; P9LE-NEXT: lxvx v5, 0, r3 +; P9LE-NEXT: addis r3, r2, .LCPI6_4@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI6_4@toc@l +; P9LE-NEXT: vsrd v4, v4, v5 +; P9LE-NEXT: lxvx v5, 0, r3 +; P9LE-NEXT: xxsel vs0, v4, v2, v5 +; P9LE-NEXT: mffprd r4, f0 +; P9LE-NEXT: mfvsrld r3, vs0 ; P9LE-NEXT: mulli r4, r4, 654 -; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: li r4, 0 -; P9LE-NEXT: mtvsrdd v2, r3, r4 +; P9LE-NEXT: mtvsrdd v4, r4, r3 +; P9LE-NEXT: vsubudm v2, v2, v4 ; P9LE-NEXT: blr ; ; P9BE-LABEL: dont_fold_urem_i64: ; P9BE: # %bb.0: +; P9BE-NEXT: lis r3, -16037 +; P9BE-NEXT: mfvsrld r4, v3 +; P9BE-NEXT: xxlxor v5, v5, v5 +; P9BE-NEXT: ori r3, r3, 28749 +; P9BE-NEXT: sldi r3, r3, 32 +; P9BE-NEXT: 
oris r3, r3, 52170 +; P9BE-NEXT: ori r3, r3, 12109 +; P9BE-NEXT: mulld r5, r4, r3 +; P9BE-NEXT: mulhdu r3, r4, r3 ; P9BE-NEXT: lis r4, 25644 -; P9BE-NEXT: mfvsrd r3, v3 ; P9BE-NEXT: ori r4, r4, 34192 ; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: mtvsrdd v4, r3, r5 +; P9BE-NEXT: mfvsrd r3, v3 ; P9BE-NEXT: oris r4, r4, 45590 ; P9BE-NEXT: ori r4, r4, 17097 -; P9BE-NEXT: mulhdu r4, r3, r4 -; P9BE-NEXT: sub r5, r3, r4 -; P9BE-NEXT: rldicl r5, r5, 63, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: lis r5, -16037 -; P9BE-NEXT: rldicl r4, r4, 60, 4 -; P9BE-NEXT: ori r5, r5, 28749 +; P9BE-NEXT: mulld r5, r3, r4 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha +; P9BE-NEXT: addi r4, r4, .LCPI6_0@toc@l +; P9BE-NEXT: lxvx v0, 0, r4 +; P9BE-NEXT: vspltb v1, v0, 15 +; P9BE-NEXT: mtvsrdd v6, r3, r5 +; P9BE-NEXT: vsro v4, v4, v0 +; P9BE-NEXT: vsro v6, v6, v0 +; P9BE-NEXT: vsro v5, v5, v0 +; P9BE-NEXT: vsr v4, v4, v1 +; P9BE-NEXT: vsr v6, v6, v1 +; P9BE-NEXT: vsr v5, v5, v1 +; P9BE-NEXT: xxmrgld v4, v6, v4 +; P9BE-NEXT: vsubudm v6, v3, v4 +; P9BE-NEXT: mfvsrd r3, v6 +; P9BE-NEXT: sldi r4, r3, 63 +; P9BE-NEXT: rldicl r3, r3, 63, 1 +; P9BE-NEXT: mtvsrdd v6, r3, r4 +; P9BE-NEXT: addis r3, r2, .LCPI6_1@toc@ha +; P9BE-NEXT: vsro v6, v6, v0 +; P9BE-NEXT: addi r3, r3, .LCPI6_1@toc@l +; P9BE-NEXT: vsr v6, v6, v1 +; P9BE-NEXT: xxmrgld v6, v6, v5 +; P9BE-NEXT: vaddudm v4, v6, v4 +; P9BE-NEXT: lxvx v6, 0, r3 +; P9BE-NEXT: vsrd v4, v4, v6 +; P9BE-NEXT: mfvsrld r3, v4 +; P9BE-NEXT: mfvsrd r4, v4 ; P9BE-NEXT: mulli r4, r4, 23 -; P9BE-NEXT: sldi r5, r5, 32 -; P9BE-NEXT: oris r5, r5, 52170 -; P9BE-NEXT: ori r5, r5, 12109 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: mfvsrld r4, v3 -; P9BE-NEXT: mulhdu r5, r4, r5 -; P9BE-NEXT: rldicl r5, r5, 52, 12 -; P9BE-NEXT: mulli r5, r5, 5423 -; P9BE-NEXT: sub r4, r4, r5 -; P9BE-NEXT: lis r5, 25653 -; P9BE-NEXT: ori r5, r5, 15432 -; P9BE-NEXT: mtvsrdd v3, r3, r4 -; P9BE-NEXT: mfvsrld r3, v2 -; P9BE-NEXT: sldi r5, r5, 32 -; P9BE-NEXT: rldicl r4, r3, 63, 1 -; P9BE-NEXT: oris r5, r5, 1603 -; P9BE-NEXT: ori r5, r5, 21445 -; P9BE-NEXT: mulhdu r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 57, 7 -; P9BE-NEXT: mulli r4, r4, 654 -; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: mtvsrdd v2, 0, r3 +; P9BE-NEXT: mulli r3, r3, 5423 +; P9BE-NEXT: mtvsrdd v4, r4, r3 +; P9BE-NEXT: addis r3, r2, .LCPI6_2@toc@ha +; P9BE-NEXT: lis r4, 25653 +; P9BE-NEXT: addi r3, r3, .LCPI6_2@toc@l +; P9BE-NEXT: vsubudm v3, v3, v4 +; P9BE-NEXT: ori r4, r4, 15432 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: oris r4, r4, 1603 +; P9BE-NEXT: ori r4, r4, 21445 +; P9BE-NEXT: vsrd v4, v2, v4 +; P9BE-NEXT: mfvsrld r3, v4 +; P9BE-NEXT: mulld r5, r3, r4 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: mtvsrdd v4, r3, r5 +; P9BE-NEXT: addis r3, r2, .LCPI6_3@toc@ha +; P9BE-NEXT: vsro v4, v4, v0 +; P9BE-NEXT: addi r3, r3, .LCPI6_3@toc@l +; P9BE-NEXT: vsr v4, v4, v1 +; P9BE-NEXT: xxmrgld v4, v5, v4 +; P9BE-NEXT: xxspltd v5, v5, 1 +; P9BE-NEXT: xxlor v4, v5, v4 +; P9BE-NEXT: lxvx v5, 0, r3 +; P9BE-NEXT: addis r3, r2, .LCPI6_4@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI6_4@toc@l +; P9BE-NEXT: vsrd v4, v4, v5 +; P9BE-NEXT: lxvx v5, 0, r3 +; P9BE-NEXT: xxsel vs0, v4, v2, v5 +; P9BE-NEXT: mfvsrld r3, vs0 +; P9BE-NEXT: mffprd r4, f0 +; P9BE-NEXT: mulli r3, r3, 654 +; P9BE-NEXT: mtvsrdd v4, r4, r3 +; P9BE-NEXT: vsubudm v2, v2, v4 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_urem_i64: ; P8LE: # %bb.0: ; P8LE-NEXT: lis r3, 25644 ; P8LE-NEXT: xxswapd vs0, v3 -; P8LE-NEXT: lis r4, -16037 -; P8LE-NEXT: lis r5, 25653 -; 
P8LE-NEXT: mfvsrd r6, v2 +; P8LE-NEXT: addis r4, r2, .LCPI6_1@toc@ha +; P8LE-NEXT: xxlxor vs2, vs2, vs2 ; P8LE-NEXT: ori r3, r3, 34192 -; P8LE-NEXT: ori r4, r4, 28749 -; P8LE-NEXT: ori r5, r5, 15432 -; P8LE-NEXT: mfvsrd r8, v3 +; P8LE-NEXT: addi r4, r4, .LCPI6_1@toc@l ; P8LE-NEXT: sldi r3, r3, 32 -; P8LE-NEXT: sldi r4, r4, 32 ; P8LE-NEXT: oris r3, r3, 45590 -; P8LE-NEXT: mffprd r7, f0 -; P8LE-NEXT: sldi r5, r5, 32 -; P8LE-NEXT: oris r4, r4, 52170 +; P8LE-NEXT: mffprd r5, f0 +; P8LE-NEXT: lxvd2x vs0, 0, r4 +; P8LE-NEXT: li r4, 0 ; P8LE-NEXT: ori r3, r3, 17097 -; P8LE-NEXT: oris r5, r5, 1603 -; P8LE-NEXT: ori r4, r4, 12109 -; P8LE-NEXT: mulhdu r3, r7, r3 -; P8LE-NEXT: rldicl r9, r6, 63, 1 -; P8LE-NEXT: ori r5, r5, 21445 -; P8LE-NEXT: mulhdu r4, r8, r4 -; P8LE-NEXT: mulhdu r5, r9, r5 -; P8LE-NEXT: sub r9, r7, r3 -; P8LE-NEXT: rldicl r9, r9, 63, 1 -; P8LE-NEXT: rldicl r4, r4, 52, 12 -; P8LE-NEXT: add r3, r9, r3 -; P8LE-NEXT: rldicl r5, r5, 57, 7 -; P8LE-NEXT: mulli r4, r4, 5423 -; P8LE-NEXT: rldicl r3, r3, 60, 4 -; P8LE-NEXT: mulli r5, r5, 654 -; P8LE-NEXT: mulli r3, r3, 23 -; P8LE-NEXT: sub r4, r8, r4 -; P8LE-NEXT: sub r5, r6, r5 +; P8LE-NEXT: mtfprd f1, r4 +; P8LE-NEXT: lis r4, -16037 +; P8LE-NEXT: mulhdu r3, r5, r3 +; P8LE-NEXT: mfvsrd r5, v3 +; P8LE-NEXT: xxswapd v4, vs0 +; P8LE-NEXT: mtfprd f0, r3 +; P8LE-NEXT: ori r3, r4, 28749 +; P8LE-NEXT: lis r4, 25653 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: vsrd v4, v2, v4 +; P8LE-NEXT: ori r4, r4, 15432 +; P8LE-NEXT: xxmrghd v5, vs1, vs0 +; P8LE-NEXT: oris r3, r3, 52170 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: ori r3, r3, 12109 +; P8LE-NEXT: mfvsrd r6, v4 +; P8LE-NEXT: oris r4, r4, 1603 +; P8LE-NEXT: mulhdu r3, r5, r3 +; P8LE-NEXT: ori r4, r4, 21445 +; P8LE-NEXT: addis r5, r2, .LCPI6_2@toc@ha +; P8LE-NEXT: vsubudm v4, v3, v5 +; P8LE-NEXT: mulhdu r4, r6, r4 +; P8LE-NEXT: addi r5, r5, .LCPI6_2@toc@l +; P8LE-NEXT: xxswapd vs0, v4 +; P8LE-NEXT: mtfprd f1, r3 +; P8LE-NEXT: mffprd r3, f0 +; P8LE-NEXT: xxspltd v4, vs1, 0 +; P8LE-NEXT: lxvd2x vs1, 0, r5 +; P8LE-NEXT: addis r5, r2, .LCPI6_3@toc@ha ; P8LE-NEXT: mtfprd f0, r4 -; P8LE-NEXT: sub r3, r7, r3 -; P8LE-NEXT: li r4, 0 -; P8LE-NEXT: mtfprd f1, r5 -; P8LE-NEXT: mtfprd f2, r3 -; P8LE-NEXT: mtfprd f3, r4 -; P8LE-NEXT: xxmrghd v3, vs0, vs2 -; P8LE-NEXT: xxmrghd v2, vs1, vs3 +; P8LE-NEXT: addis r4, r2, .LCPI6_0@toc@ha +; P8LE-NEXT: rldicl r3, r3, 63, 1 +; P8LE-NEXT: addi r4, r4, .LCPI6_0@toc@l +; P8LE-NEXT: mtfprd f3, r3 +; P8LE-NEXT: xxspltd v0, vs0, 0 +; P8LE-NEXT: lxvd2x vs0, 0, r4 +; P8LE-NEXT: addi r3, r5, .LCPI6_3@toc@l +; P8LE-NEXT: xxmrgld v4, v4, v5 +; P8LE-NEXT: lxvd2x vs4, 0, r3 +; P8LE-NEXT: xxswapd v1, vs1 +; P8LE-NEXT: xxpermdi v5, vs2, vs3, 2 +; P8LE-NEXT: xxmrgld v0, v0, vs2 +; P8LE-NEXT: xxswapd v6, vs0 +; P8LE-NEXT: vaddudm v4, v5, v4 +; P8LE-NEXT: xxswapd v5, vs4 +; P8LE-NEXT: vsrd v0, v0, v1 +; P8LE-NEXT: vsrd v4, v4, v6 +; P8LE-NEXT: xxsel vs0, v0, v2, v5 +; P8LE-NEXT: xxswapd vs1, v4 +; P8LE-NEXT: mfvsrd r3, v4 +; P8LE-NEXT: mffprd r5, f0 +; P8LE-NEXT: xxswapd vs2, vs0 +; P8LE-NEXT: mulli r3, r3, 5423 +; P8LE-NEXT: mffprd r4, f1 +; P8LE-NEXT: mulli r5, r5, 654 +; P8LE-NEXT: mulli r4, r4, 23 +; P8LE-NEXT: mffprd r6, f2 +; P8LE-NEXT: mtfprd f1, r3 +; P8LE-NEXT: mtfprd f0, r6 +; P8LE-NEXT: mtfprd f3, r5 +; P8LE-NEXT: mtfprd f2, r4 +; P8LE-NEXT: xxmrghd v5, vs3, vs0 +; P8LE-NEXT: xxmrghd v4, vs1, vs2 +; P8LE-NEXT: vsubudm v2, v2, v5 +; P8LE-NEXT: vsubudm v3, v3, v4 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_urem_i64: @@ -1152,46 +885,73 @@ ; P8BE-NEXT: lis r3, 25644 ; P8BE-NEXT: lis 
r4, -16037 ; P8BE-NEXT: xxswapd vs0, v3 -; P8BE-NEXT: xxswapd vs1, v2 -; P8BE-NEXT: lis r5, 25653 +; P8BE-NEXT: addis r5, r2, .LCPI6_1@toc@ha +; P8BE-NEXT: mfvsrd r6, v3 ; P8BE-NEXT: ori r3, r3, 34192 ; P8BE-NEXT: ori r4, r4, 28749 -; P8BE-NEXT: mfvsrd r6, v3 -; P8BE-NEXT: ori r5, r5, 15432 +; P8BE-NEXT: addi r5, r5, .LCPI6_1@toc@l ; P8BE-NEXT: sldi r3, r3, 32 ; P8BE-NEXT: sldi r4, r4, 32 +; P8BE-NEXT: lxvd2x v4, 0, r5 ; P8BE-NEXT: oris r3, r3, 45590 -; P8BE-NEXT: sldi r5, r5, 32 -; P8BE-NEXT: mffprd r7, f0 ; P8BE-NEXT: oris r4, r4, 52170 +; P8BE-NEXT: mffprd r7, f0 ; P8BE-NEXT: ori r3, r3, 17097 -; P8BE-NEXT: mffprd r8, f1 -; P8BE-NEXT: oris r5, r5, 1603 ; P8BE-NEXT: ori r4, r4, 12109 ; P8BE-NEXT: mulhdu r3, r6, r3 -; P8BE-NEXT: ori r5, r5, 21445 ; P8BE-NEXT: mulhdu r4, r7, r4 -; P8BE-NEXT: rldicl r9, r8, 63, 1 -; P8BE-NEXT: mulhdu r5, r9, r5 -; P8BE-NEXT: sub r9, r6, r3 -; P8BE-NEXT: rldicl r9, r9, 63, 1 -; P8BE-NEXT: rldicl r4, r4, 52, 12 -; P8BE-NEXT: add r3, r9, r3 -; P8BE-NEXT: mulli r4, r4, 5423 -; P8BE-NEXT: rldicl r5, r5, 57, 7 -; P8BE-NEXT: rldicl r3, r3, 60, 4 -; P8BE-NEXT: mulli r5, r5, 654 +; P8BE-NEXT: vsrd v4, v2, v4 +; P8BE-NEXT: xxswapd vs1, v4 +; P8BE-NEXT: mtfprd f0, r3 +; P8BE-NEXT: lis r3, 25653 +; P8BE-NEXT: mtfprd f2, r4 +; P8BE-NEXT: ori r3, r3, 15432 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: mffprd r4, f1 +; P8BE-NEXT: xxmrghd v4, vs0, vs2 +; P8BE-NEXT: oris r3, r3, 1603 +; P8BE-NEXT: ori r3, r3, 21445 +; P8BE-NEXT: mulhdu r3, r4, r3 +; P8BE-NEXT: vsubudm v5, v3, v4 +; P8BE-NEXT: mfvsrd r4, v5 +; P8BE-NEXT: mtfprd f0, r3 +; P8BE-NEXT: rldicl r3, r4, 63, 1 +; P8BE-NEXT: addis r4, r2, .LCPI6_3@toc@ha +; P8BE-NEXT: mtfprd f1, r3 +; P8BE-NEXT: xxspltd v5, vs0, 0 +; P8BE-NEXT: addis r3, r2, .LCPI6_2@toc@ha +; P8BE-NEXT: addi r4, r4, .LCPI6_3@toc@l +; P8BE-NEXT: xxlxor vs0, vs0, vs0 +; P8BE-NEXT: addi r3, r3, .LCPI6_2@toc@l +; P8BE-NEXT: lxvd2x v6, 0, r4 +; P8BE-NEXT: xxspltd v0, vs1, 0 +; P8BE-NEXT: lxvd2x v1, 0, r3 +; P8BE-NEXT: addis r3, r2, .LCPI6_0@toc@ha +; P8BE-NEXT: xxmrgld v5, vs0, v5 +; P8BE-NEXT: addi r3, r3, .LCPI6_0@toc@l +; P8BE-NEXT: xxmrgld v0, v0, vs0 +; P8BE-NEXT: vsrd v5, v5, v1 +; P8BE-NEXT: lxvd2x v1, 0, r3 +; P8BE-NEXT: vaddudm v4, v0, v4 +; P8BE-NEXT: xxsel vs0, v5, v2, v6 +; P8BE-NEXT: vsrd v4, v4, v1 +; P8BE-NEXT: xxswapd vs2, vs0 +; P8BE-NEXT: mffprd r6, f0 +; P8BE-NEXT: xxswapd vs1, v4 +; P8BE-NEXT: mfvsrd r3, v4 +; P8BE-NEXT: mtfprd f0, r6 +; P8BE-NEXT: mffprd r5, f2 ; P8BE-NEXT: mulli r3, r3, 23 -; P8BE-NEXT: sub r4, r7, r4 -; P8BE-NEXT: mtfprd f0, r4 -; P8BE-NEXT: sub r4, r8, r5 -; P8BE-NEXT: sub r3, r6, r3 -; P8BE-NEXT: mtfprd f1, r4 -; P8BE-NEXT: li r4, 0 -; P8BE-NEXT: mtfprd f2, r3 -; P8BE-NEXT: mtfprd f3, r4 -; P8BE-NEXT: xxmrghd v3, vs2, vs0 -; P8BE-NEXT: xxmrghd v2, vs3, vs1 +; P8BE-NEXT: mffprd r4, f1 +; P8BE-NEXT: mulli r5, r5, 654 +; P8BE-NEXT: mulli r4, r4, 5423 +; P8BE-NEXT: mtfprd f1, r3 +; P8BE-NEXT: mtfprd f3, r5 +; P8BE-NEXT: mtfprd f2, r4 +; P8BE-NEXT: xxmrghd v5, vs0, vs3 +; P8BE-NEXT: xxmrghd v4, vs1, vs2 +; P8BE-NEXT: vsubudm v2, v2, v5 +; P8BE-NEXT: vsubudm v3, v3, v4 ; P8BE-NEXT: blr %1 = urem <4 x i64> %x, ret <4 x i64> %1 diff --git a/llvm/test/CodeGen/RISCV/div.ll b/llvm/test/CodeGen/RISCV/div.ll --- a/llvm/test/CodeGen/RISCV/div.ll +++ b/llvm/test/CodeGen/RISCV/div.ll @@ -49,8 +49,12 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) -; RV32I-NEXT: addi a1, zero, 5 -; RV32I-NEXT: call __udivsi3 +; RV32I-NEXT: lui a1, 838861 +; RV32I-NEXT: addi a2, a1, -819 +; RV32I-NEXT: mv a1, zero +; 
RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: srli a0, a1, 2 ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -69,8 +73,12 @@ ; RV64I-NEXT: sd ra, 8(sp) ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 -; RV64I-NEXT: addi a1, zero, 5 -; RV64I-NEXT: call __udivdi3 +; RV64I-NEXT: lui a1, 205 +; RV64I-NEXT: addiw a1, a1, -819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -819 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 34 ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -79,16 +87,12 @@ ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a0, a0, 32 ; RV64IM-NEXT: srli a0, a0, 32 -; RV64IM-NEXT: lui a1, 1035469 +; RV64IM-NEXT: lui a1, 205 ; RV64IM-NEXT: addiw a1, a1, -819 ; RV64IM-NEXT: slli a1, a1, 12 ; RV64IM-NEXT: addi a1, a1, -819 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -819 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -819 -; RV64IM-NEXT: mulhu a0, a0, a1 -; RV64IM-NEXT: srli a0, a0, 2 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: srli a0, a0, 34 ; RV64IM-NEXT: ret %1 = udiv i32 %a, 5 ret i32 %1 @@ -157,32 +161,78 @@ define i64 @udiv64_constant(i64 %a) nounwind { ; RV32I-LABEL: udiv64_constant: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) -; RV32I-NEXT: addi a2, zero, 5 -; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __udivdi3 -; RV32I-NEXT: lw ra, 12(sp) -; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: addi sp, sp, -64 +; RV32I-NEXT: sw ra, 60(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw a1, 28(sp) +; RV32I-NEXT: sw a0, 24(sp) +; RV32I-NEXT: lui a0, 838861 +; RV32I-NEXT: addi a1, a0, -820 +; RV32I-NEXT: sw a1, 12(sp) +; RV32I-NEXT: addi a3, a0, -819 +; RV32I-NEXT: addi a0, sp, 40 +; RV32I-NEXT: addi a1, sp, 24 +; RV32I-NEXT: addi a2, sp, 8 +; RV32I-NEXT: sw a3, 8(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: lw a1, 52(sp) +; RV32I-NEXT: lw a0, 48(sp) +; RV32I-NEXT: slli a2, a1, 30 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a1, a1, 2 +; RV32I-NEXT: lw ra, 60(sp) +; RV32I-NEXT: addi sp, sp, 64 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: udiv64_constant: ; RV32IM: # %bb.0: -; RV32IM-NEXT: addi sp, sp, -16 -; RV32IM-NEXT: sw ra, 12(sp) -; RV32IM-NEXT: addi a2, zero, 5 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __udivdi3 -; RV32IM-NEXT: lw ra, 12(sp) -; RV32IM-NEXT: addi sp, sp, 16 +; RV32IM-NEXT: addi sp, sp, -64 +; RV32IM-NEXT: sw ra, 60(sp) +; RV32IM-NEXT: sw zero, 20(sp) +; RV32IM-NEXT: sw zero, 16(sp) +; RV32IM-NEXT: sw zero, 36(sp) +; RV32IM-NEXT: sw zero, 32(sp) +; RV32IM-NEXT: sw a1, 28(sp) +; RV32IM-NEXT: sw a0, 24(sp) +; RV32IM-NEXT: lui a0, 838861 +; RV32IM-NEXT: addi a1, a0, -820 +; RV32IM-NEXT: sw a1, 12(sp) +; RV32IM-NEXT: addi a3, a0, -819 +; RV32IM-NEXT: addi a0, sp, 40 +; RV32IM-NEXT: addi a1, sp, 24 +; RV32IM-NEXT: addi a2, sp, 8 +; RV32IM-NEXT: sw a3, 8(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: lw a1, 52(sp) +; RV32IM-NEXT: lw a0, 48(sp) +; RV32IM-NEXT: slli a2, a1, 30 +; RV32IM-NEXT: srli a0, a0, 2 +; RV32IM-NEXT: or a0, a0, a2 +; RV32IM-NEXT: srli a1, a1, 2 +; RV32IM-NEXT: lw ra, 60(sp) +; RV32IM-NEXT: addi sp, sp, 64 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: udiv64_constant: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) -; RV64I-NEXT: addi a1, zero, 5 -; RV64I-NEXT: call __udivdi3 +; RV64I-NEXT: lui a1, 1035469 +; RV64I-NEXT: addiw a1, 
a1, -819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a2, a1, -819 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: srli a0, a1, 2 ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -243,8 +293,14 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) -; RV32I-NEXT: addi a1, zero, 5 -; RV32I-NEXT: call __divsi3 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: lui a2, 419430 +; RV32I-NEXT: addi a2, a2, 1639 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: srai a1, a1, 1 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -264,8 +320,12 @@ ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) ; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: addi a1, zero, 5 -; RV64I-NEXT: call __divdi3 +; RV64I-NEXT: lui a1, 419430 +; RV64I-NEXT: addiw a1, a1, 1639 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a1, a0, 63 +; RV64I-NEXT: srai a0, a0, 33 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -273,17 +333,11 @@ ; RV64IM-LABEL: sdiv_constant: ; RV64IM: # %bb.0: ; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 13107 -; RV64IM-NEXT: addiw a1, a1, 819 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 819 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 819 -; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, 1639 -; RV64IM-NEXT: mulh a0, a0, a1 +; RV64IM-NEXT: lui a1, 419430 +; RV64IM-NEXT: addiw a1, a1, 1639 +; RV64IM-NEXT: mul a0, a0, a1 ; RV64IM-NEXT: srli a1, a0, 63 -; RV64IM-NEXT: srai a0, a0, 1 +; RV64IM-NEXT: srai a0, a0, 33 ; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret %1 = sdiv i32 %a, 5 @@ -367,32 +421,90 @@ define i64 @sdiv64_constant(i64 %a) nounwind { ; RV32I-LABEL: sdiv64_constant: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) -; RV32I-NEXT: addi a2, zero, 5 -; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __divdi3 -; RV32I-NEXT: lw ra, 12(sp) -; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: addi sp, sp, -64 +; RV32I-NEXT: sw ra, 60(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw a1, 28(sp) +; RV32I-NEXT: sw a0, 24(sp) +; RV32I-NEXT: lui a0, 419430 +; RV32I-NEXT: addi a2, a0, 1638 +; RV32I-NEXT: sw a2, 12(sp) +; RV32I-NEXT: addi a0, a0, 1639 +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: srai a3, a1, 31 +; RV32I-NEXT: sw a3, 36(sp) +; RV32I-NEXT: addi a0, sp, 40 +; RV32I-NEXT: addi a1, sp, 24 +; RV32I-NEXT: addi a2, sp, 8 +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: lw a1, 52(sp) +; RV32I-NEXT: lw a0, 48(sp) +; RV32I-NEXT: slli a2, a1, 31 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: or a2, a0, a2 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: srai a1, a1, 1 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: lw ra, 60(sp) +; RV32I-NEXT: addi sp, sp, 64 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: sdiv64_constant: ; RV32IM: # %bb.0: -; RV32IM-NEXT: addi sp, sp, -16 -; RV32IM-NEXT: sw ra, 12(sp) -; RV32IM-NEXT: addi a2, zero, 5 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __divdi3 -; RV32IM-NEXT: lw ra, 12(sp) -; RV32IM-NEXT: addi sp, sp, 16 +; RV32IM-NEXT: addi sp, sp, -64 +; RV32IM-NEXT: sw ra, 60(sp) +; RV32IM-NEXT: sw zero, 
20(sp) +; RV32IM-NEXT: sw zero, 16(sp) +; RV32IM-NEXT: sw a1, 28(sp) +; RV32IM-NEXT: sw a0, 24(sp) +; RV32IM-NEXT: lui a0, 419430 +; RV32IM-NEXT: addi a2, a0, 1638 +; RV32IM-NEXT: sw a2, 12(sp) +; RV32IM-NEXT: addi a0, a0, 1639 +; RV32IM-NEXT: sw a0, 8(sp) +; RV32IM-NEXT: srai a3, a1, 31 +; RV32IM-NEXT: sw a3, 36(sp) +; RV32IM-NEXT: addi a0, sp, 40 +; RV32IM-NEXT: addi a1, sp, 24 +; RV32IM-NEXT: addi a2, sp, 8 +; RV32IM-NEXT: sw a3, 32(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: lw a1, 52(sp) +; RV32IM-NEXT: lw a0, 48(sp) +; RV32IM-NEXT: slli a2, a1, 31 +; RV32IM-NEXT: srli a0, a0, 1 +; RV32IM-NEXT: or a2, a0, a2 +; RV32IM-NEXT: srli a0, a1, 31 +; RV32IM-NEXT: add a0, a2, a0 +; RV32IM-NEXT: sltu a2, a0, a2 +; RV32IM-NEXT: srai a1, a1, 1 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: lw ra, 60(sp) +; RV32IM-NEXT: addi sp, sp, 64 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: sdiv64_constant: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) -; RV64I-NEXT: addi a1, zero, 5 -; RV64I-NEXT: call __divdi3 +; RV64I-NEXT: srai a1, a0, 63 +; RV64I-NEXT: lui a2, 13107 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 819 +; RV64I-NEXT: slli a2, a2, 13 +; RV64I-NEXT: addi a2, a2, 1639 +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: srli a0, a1, 63 +; RV64I-NEXT: srai a1, a1, 1 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/srem-lkk.ll b/llvm/test/CodeGen/RISCV/srem-lkk.ll --- a/llvm/test/CodeGen/RISCV/srem-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-lkk.ll @@ -13,8 +13,22 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: lui a0, 706409 +; RV32I-NEXT: addi a2, a0, 389 +; RV32I-NEXT: addi a3, zero, -1 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: add a0, a1, s0 +; RV32I-NEXT: srli a1, a0, 31 +; RV32I-NEXT: srai a0, a0, 6 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -37,32 +51,39 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: lui a1, 706409 +; RV64I-NEXT: addiw a1, a1, 389 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: addw a0, a0, s0 +; RV64I-NEXT: srliw a1, a0, 31 +; RV64I-NEXT: srli a0, a0, 6 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: subw a0, s0, a0 +; RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: fold_srem_positive_odd: ; RV64IM: # %bb.0: -; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 1045903 -; RV64IM-NEXT: addiw a1, a1, -733 -; RV64IM-NEXT: slli a1, a1, 15 -; RV64IM-NEXT: addi a1, a1, 1035 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -905 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -1767 -; RV64IM-NEXT: mulh a1, a0, a1 -; RV64IM-NEXT: add a1, a1, a0 -; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 6 +; RV64IM-NEXT: sext.w a1, a0 +; 
RV64IM-NEXT: lui a2, 706409 +; RV64IM-NEXT: addiw a2, a2, 389 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 32 +; RV64IM-NEXT: addw a1, a1, a0 +; RV64IM-NEXT: srliw a2, a1, 31 +; RV64IM-NEXT: srli a1, a1, 6 ; RV64IM-NEXT: add a1, a1, a2 ; RV64IM-NEXT: addi a2, zero, 95 ; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: subw a0, a0, a1 ; RV64IM-NEXT: ret %1 = srem i32 %x, 95 ret i32 %1 @@ -74,8 +95,21 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: lui a0, 253241 +; RV32I-NEXT: addi a2, a0, -15 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: srai a1, a1, 8 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: addi a1, zero, 1060 -; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -97,29 +131,35 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: lui a1, 253241 +; RV64I-NEXT: addiw a1, a1, -15 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a1, a0, 63 +; RV64I-NEXT: srai a0, a0, 40 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 1060 -; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: subw a0, s0, a0 +; RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: fold_srem_positive_even: ; RV64IM: # %bb.0: -; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 506482 -; RV64IM-NEXT: addiw a1, a1, -31 -; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, 711 -; RV64IM-NEXT: slli a1, a1, 19 -; RV64IM-NEXT: addi a1, a1, 1979 -; RV64IM-NEXT: mulh a1, a0, a1 +; RV64IM-NEXT: sext.w a1, a0 +; RV64IM-NEXT: lui a2, 253241 +; RV64IM-NEXT: addiw a2, a2, -15 +; RV64IM-NEXT: mul a1, a1, a2 ; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 9 +; RV64IM-NEXT: srai a1, a1, 40 ; RV64IM-NEXT: add a1, a1, a2 ; RV64IM-NEXT: addi a2, zero, 1060 ; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: subw a0, a0, a1 ; RV64IM-NEXT: ret %1 = srem i32 %x, 1060 ret i32 %1 @@ -131,8 +171,21 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: lui a0, 677296 +; RV32I-NEXT: addi a2, a0, -91 +; RV32I-NEXT: addi a3, zero, -1 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: srai a1, a1, 8 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: addi a1, zero, -723 -; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -154,32 +207,35 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: lui a1, 677296 +; RV64I-NEXT: addiw a1, a1, -91 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a1, a0, 63 +; RV64I-NEXT: srai a0, a0, 40 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, -723 -; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: subw a0, s0, a0 +; 
RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: fold_srem_negative_odd: ; RV64IM: # %bb.0: -; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 4781 -; RV64IM-NEXT: addiw a1, a1, 2045 -; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, 1371 -; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, -11 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -1355 -; RV64IM-NEXT: mulh a1, a0, a1 -; RV64IM-NEXT: sub a1, a1, a0 +; RV64IM-NEXT: sext.w a1, a0 +; RV64IM-NEXT: lui a2, 677296 +; RV64IM-NEXT: addiw a2, a2, -91 +; RV64IM-NEXT: mul a1, a1, a2 ; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 9 +; RV64IM-NEXT: srai a1, a1, 40 ; RV64IM-NEXT: add a1, a1, a2 ; RV64IM-NEXT: addi a2, zero, -723 ; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: subw a0, a0, a1 ; RV64IM-NEXT: ret %1 = srem i32 %x, -723 ret i32 %1 @@ -191,9 +247,22 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: lui a0, 1036895 +; RV32I-NEXT: addi a2, a0, 999 +; RV32I-NEXT: addi a3, zero, -1 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: srai a1, a1, 8 +; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: lui a1, 1048570 ; RV32I-NEXT: addi a1, a1, 1595 -; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -216,33 +285,37 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: lui a1, 1036895 +; RV64I-NEXT: addiw a1, a1, 999 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a1, a0, 63 +; RV64I-NEXT: srai a0, a0, 40 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 1048570 ; RV64I-NEXT: addiw a1, a1, 1595 -; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: subw a0, s0, a0 +; RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: fold_srem_negative_even: ; RV64IM: # %bb.0: -; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 1036895 -; RV64IM-NEXT: addiw a1, a1, 999 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 11 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -523 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -481 -; RV64IM-NEXT: mulh a1, a0, a1 +; RV64IM-NEXT: sext.w a1, a0 +; RV64IM-NEXT: lui a2, 1036895 +; RV64IM-NEXT: addiw a2, a2, 999 +; RV64IM-NEXT: mul a1, a1, a2 ; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 12 +; RV64IM-NEXT: srai a1, a1, 40 ; RV64IM-NEXT: add a1, a1, a2 ; RV64IM-NEXT: lui a2, 1048570 ; RV64IM-NEXT: addiw a2, a2, 1595 ; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: subw a0, a0, a1 ; RV64IM-NEXT: ret %1 = srem i32 %x, -22981 ret i32 %1 @@ -258,13 +331,21 @@ ; RV32I-NEXT: sw s0, 8(sp) ; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: lui a0, 706409 +; RV32I-NEXT: addi a2, a0, 389 +; RV32I-NEXT: addi a3, zero, -1 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __divsi3 -; RV32I-NEXT: add a0, s1, a0 +; RV32I-NEXT: call __muldi3 +; 
RV32I-NEXT: add a0, a1, s0 +; RV32I-NEXT: srli a1, a0, 31 +; RV32I-NEXT: srai a0, a0, 6 +; RV32I-NEXT: add s1, a0, a1 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: add a0, a0, s1 ; RV32I-NEXT: lw s1, 4(sp) ; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) @@ -292,15 +373,22 @@ ; RV64I-NEXT: sd ra, 24(sp) ; RV64I-NEXT: sd s0, 16(sp) ; RV64I-NEXT: sd s1, 8(sp) -; RV64I-NEXT: sext.w s0, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: lui a1, 706409 +; RV64I-NEXT: addiw a1, a1, 389 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: addw a1, a0, s0 +; RV64I-NEXT: srliw a1, a1, 31 +; RV64I-NEXT: add a0, a0, s0 +; RV64I-NEXT: sraiw a0, a0, 6 +; RV64I-NEXT: add s1, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __divdi3 -; RV64I-NEXT: addw a0, s1, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: addw a0, a0, s1 ; RV64I-NEXT: ld s1, 8(sp) ; RV64I-NEXT: ld s0, 16(sp) ; RV64I-NEXT: ld ra, 24(sp) @@ -310,18 +398,14 @@ ; RV64IM-LABEL: combine_srem_sdiv: ; RV64IM: # %bb.0: ; RV64IM-NEXT: sext.w a1, a0 -; RV64IM-NEXT: lui a2, 1045903 -; RV64IM-NEXT: addiw a2, a2, -733 -; RV64IM-NEXT: slli a2, a2, 15 -; RV64IM-NEXT: addi a2, a2, 1035 -; RV64IM-NEXT: slli a2, a2, 12 -; RV64IM-NEXT: addi a2, a2, -905 -; RV64IM-NEXT: slli a2, a2, 12 -; RV64IM-NEXT: addi a2, a2, -1767 -; RV64IM-NEXT: mulh a2, a1, a2 -; RV64IM-NEXT: add a1, a2, a1 -; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 6 +; RV64IM-NEXT: lui a2, 706409 +; RV64IM-NEXT: addiw a2, a2, 389 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 32 +; RV64IM-NEXT: addw a2, a1, a0 +; RV64IM-NEXT: srliw a2, a2, 31 +; RV64IM-NEXT: add a1, a1, a0 +; RV64IM-NEXT: sraiw a1, a1, 6 ; RV64IM-NEXT: add a1, a1, a2 ; RV64IM-NEXT: addi a2, zero, 95 ; RV64IM-NEXT: mul a2, a1, a2 @@ -450,32 +534,127 @@ define i64 @dont_fold_srem_i64(i64 %x) nounwind { ; RV32I-LABEL: dont_fold_srem_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, -64 +; RV32I-NEXT: sw ra, 60(sp) +; RV32I-NEXT: sw s0, 56(sp) +; RV32I-NEXT: sw s1, 52(sp) +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 8(sp) +; RV32I-NEXT: sw a1, 20(sp) +; RV32I-NEXT: sw a0, 16(sp) +; RV32I-NEXT: lui a0, 342392 +; RV32I-NEXT: addi a0, a0, 668 +; RV32I-NEXT: sw a0, 4(sp) +; RV32I-NEXT: lui a0, 770382 +; RV32I-NEXT: addi a0, a0, 1505 +; RV32I-NEXT: sw a0, 0(sp) +; RV32I-NEXT: srai a3, a1, 31 +; RV32I-NEXT: sw a3, 28(sp) +; RV32I-NEXT: addi a0, sp, 32 +; RV32I-NEXT: addi a1, sp, 16 +; RV32I-NEXT: mv a2, sp +; RV32I-NEXT: sw a3, 24(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: lw a1, 44(sp) +; RV32I-NEXT: lw a0, 40(sp) +; RV32I-NEXT: slli a2, a1, 27 +; RV32I-NEXT: srli a0, a0, 5 +; RV32I-NEXT: or a2, a0, a2 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: srai a1, a1, 5 +; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: addi a2, zero, 98 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __moddi3 -; RV32I-NEXT: lw ra, 12(sp) -; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: sltu a2, s1, a0 +; RV32I-NEXT: sub a1, s0, a1 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a0, s1, a0 +; 
RV32I-NEXT: lw s1, 52(sp) +; RV32I-NEXT: lw s0, 56(sp) +; RV32I-NEXT: lw ra, 60(sp) +; RV32I-NEXT: addi sp, sp, 64 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: dont_fold_srem_i64: ; RV32IM: # %bb.0: -; RV32IM-NEXT: addi sp, sp, -16 -; RV32IM-NEXT: sw ra, 12(sp) -; RV32IM-NEXT: addi a2, zero, 98 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __moddi3 -; RV32IM-NEXT: lw ra, 12(sp) -; RV32IM-NEXT: addi sp, sp, 16 +; RV32IM-NEXT: addi sp, sp, -64 +; RV32IM-NEXT: sw ra, 60(sp) +; RV32IM-NEXT: sw s0, 56(sp) +; RV32IM-NEXT: sw s1, 52(sp) +; RV32IM-NEXT: mv s0, a1 +; RV32IM-NEXT: mv s1, a0 +; RV32IM-NEXT: sw zero, 12(sp) +; RV32IM-NEXT: sw zero, 8(sp) +; RV32IM-NEXT: sw a1, 20(sp) +; RV32IM-NEXT: sw a0, 16(sp) +; RV32IM-NEXT: lui a0, 342392 +; RV32IM-NEXT: addi a0, a0, 668 +; RV32IM-NEXT: sw a0, 4(sp) +; RV32IM-NEXT: lui a0, 770382 +; RV32IM-NEXT: addi a0, a0, 1505 +; RV32IM-NEXT: sw a0, 0(sp) +; RV32IM-NEXT: srai a3, a1, 31 +; RV32IM-NEXT: sw a3, 28(sp) +; RV32IM-NEXT: addi a0, sp, 32 +; RV32IM-NEXT: addi a1, sp, 16 +; RV32IM-NEXT: mv a2, sp +; RV32IM-NEXT: sw a3, 24(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: lw a0, 44(sp) +; RV32IM-NEXT: lw a1, 40(sp) +; RV32IM-NEXT: slli a2, a0, 27 +; RV32IM-NEXT: srli a1, a1, 5 +; RV32IM-NEXT: or a1, a1, a2 +; RV32IM-NEXT: srli a2, a0, 31 +; RV32IM-NEXT: add a2, a1, a2 +; RV32IM-NEXT: sltu a1, a2, a1 +; RV32IM-NEXT: srai a0, a0, 5 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: addi a1, zero, 98 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: mulhu a3, a2, a1 +; RV32IM-NEXT: add a0, a3, a0 +; RV32IM-NEXT: sub a0, s0, a0 +; RV32IM-NEXT: mul a2, a2, a1 +; RV32IM-NEXT: sltu a1, s1, a2 +; RV32IM-NEXT: sub a1, a0, a1 +; RV32IM-NEXT: sub a0, s1, a2 +; RV32IM-NEXT: lw s1, 52(sp) +; RV32IM-NEXT: lw s0, 56(sp) +; RV32IM-NEXT: lw ra, 60(sp) +; RV32IM-NEXT: addi sp, sp, 64 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_srem_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: srai a1, a0, 63 +; RV64I-NEXT: lui a0, 2675 +; RV64I-NEXT: addiw a0, a0, -251 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a0, a0, 1839 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a0, a0, 167 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a2, a0, 1505 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: srli a0, a1, 63 +; RV64I-NEXT: srai a1, a1, 5 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: addi a1, zero, 98 -; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -19,30 +19,66 @@ ; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: sw s5, 4(sp) -; RV32I-NEXT: lh s2, 12(a1) -; RV32I-NEXT: lh s3, 8(a1) -; RV32I-NEXT: lh s0, 4(a1) -; RV32I-NEXT: lh a2, 0(a1) -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lh s0, 0(a1) +; RV32I-NEXT: lui a0, 1048571 +; RV32I-NEXT: addi a1, a0, -905 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s0 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi 
a1, zero, 95 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s3, s0, a0 +; RV32I-NEXT: lh s0, 12(s1) +; RV32I-NEXT: lui a0, 8 +; RV32I-NEXT: addi a1, a0, -687 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: sub a0, a0, s0 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 25 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: addi a1, zero, -1003 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s4, s0, a0 +; RV32I-NEXT: lh s0, 4(s1) +; RV32I-NEXT: lui a0, 1048572 +; RV32I-NEXT: addi a1, a0, -529 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srai a0, a0, 21 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, -124 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s5, s0, a0 +; RV32I-NEXT: lh s0, 8(s1) +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a1, a0, -1421 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srai a0, a0, 18 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 98 -; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, zero, -1003 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: sh a0, 6(s1) -; RV32I-NEXT: sh s0, 4(s1) -; RV32I-NEXT: sh s5, 2(s1) -; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: sh a0, 4(s2) +; RV32I-NEXT: sh s5, 2(s2) +; RV32I-NEXT: sh s4, 6(s2) +; RV32I-NEXT: sh s3, 0(s2) ; RV32I-NEXT: lw s5, 4(sp) ; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) @@ -55,52 +91,60 @@ ; ; RV32IM-LABEL: fold_srem_vec_1: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a6, 12(a1) -; RV32IM-NEXT: lh a3, 8(a1) -; RV32IM-NEXT: lh a4, 0(a1) -; RV32IM-NEXT: lh a1, 4(a1) -; RV32IM-NEXT: lui a5, 706409 -; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a5, a4, a5 -; RV32IM-NEXT: add a5, a5, a4 -; RV32IM-NEXT: srli a2, a5, 31 -; RV32IM-NEXT: srli a5, a5, 6 -; RV32IM-NEXT: add a2, a5, a2 -; RV32IM-NEXT: addi a5, zero, 95 -; RV32IM-NEXT: mul a2, a2, a5 -; RV32IM-NEXT: sub a2, a4, a2 -; RV32IM-NEXT: lui a4, 507375 -; RV32IM-NEXT: addi a4, a4, 1981 -; RV32IM-NEXT: mulh a4, a1, a4 -; RV32IM-NEXT: sub a4, a4, a1 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 6 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: addi a5, zero, -124 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a1, a1, a4 -; RV32IM-NEXT: lui a4, 342392 -; RV32IM-NEXT: addi a4, a4, 669 -; RV32IM-NEXT: mulh a4, a3, a4 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 5 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: addi a5, zero, 98 -; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: lh a2, 0(a1) +; RV32IM-NEXT: lui a3, 1048571 +; RV32IM-NEXT: addi a3, a3, -905 +; RV32IM-NEXT: mul a3, a2, a3 +; RV32IM-NEXT: srli a3, a3, 16 +; RV32IM-NEXT: add a3, a3, a2 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 22 +; RV32IM-NEXT: srli a4, a3, 15 +; RV32IM-NEXT: andi a4, a4, 1 +; RV32IM-NEXT: add a3, a3, a4 +; RV32IM-NEXT: addi a4, zero, 95 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: lh a4, 12(a1) +; RV32IM-NEXT: sub a6, a2, a3 +; RV32IM-NEXT: lui a3, 8 +; RV32IM-NEXT: addi a3, a3, -687 +; RV32IM-NEXT: mul a3, a4, a3 +; RV32IM-NEXT: srli a3, a3, 16 ; RV32IM-NEXT: sub a3, a3, a4 
-; RV32IM-NEXT: lui a4, 780943 -; RV32IM-NEXT: addi a4, a4, 1809 -; RV32IM-NEXT: mulh a4, a6, a4 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 8 -; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 25 +; RV32IM-NEXT: srli a5, a3, 15 +; RV32IM-NEXT: andi a5, a5, 1 +; RV32IM-NEXT: add a3, a3, a5 ; RV32IM-NEXT: addi a5, zero, -1003 +; RV32IM-NEXT: mul a3, a3, a5 +; RV32IM-NEXT: lh a5, 4(a1) +; RV32IM-NEXT: sub a3, a4, a3 +; RV32IM-NEXT: lui a4, 1048572 +; RV32IM-NEXT: addi a4, a4, -529 +; RV32IM-NEXT: mul a4, a5, a4 +; RV32IM-NEXT: srai a4, a4, 21 +; RV32IM-NEXT: srli a2, a4, 15 +; RV32IM-NEXT: andi a2, a2, 1 +; RV32IM-NEXT: add a2, a4, a2 +; RV32IM-NEXT: addi a4, zero, -124 +; RV32IM-NEXT: mul a2, a2, a4 +; RV32IM-NEXT: lh a1, 8(a1) +; RV32IM-NEXT: sub a2, a5, a2 +; RV32IM-NEXT: lui a4, 1 +; RV32IM-NEXT: addi a4, a4, -1421 +; RV32IM-NEXT: mul a4, a1, a4 +; RV32IM-NEXT: srai a4, a4, 18 +; RV32IM-NEXT: srli a5, a4, 15 +; RV32IM-NEXT: andi a5, a5, 1 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: addi a5, zero, 98 ; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a4, a6, a4 -; RV32IM-NEXT: sh a4, 6(a0) -; RV32IM-NEXT: sh a3, 4(a0) -; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: sh a1, 4(a0) +; RV32IM-NEXT: sh a2, 2(a0) +; RV32IM-NEXT: sh a3, 6(a0) +; RV32IM-NEXT: sh a6, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: fold_srem_vec_1: @@ -113,30 +157,66 @@ ; RV64I-NEXT: sd s3, 24(sp) ; RV64I-NEXT: sd s4, 16(sp) ; RV64I-NEXT: sd s5, 8(sp) -; RV64I-NEXT: lh s2, 24(a1) -; RV64I-NEXT: lh s3, 16(a1) -; RV64I-NEXT: lh s0, 8(a1) -; RV64I-NEXT: lh a2, 0(a1) -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: lh s0, 0(a1) +; RV64I-NEXT: lui a0, 1048571 +; RV64I-NEXT: addiw a1, a0, -905 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s0 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s3, s0, a0 +; RV64I-NEXT: lh s0, 24(s1) +; RV64I-NEXT: lui a0, 8 +; RV64I-NEXT: addiw a1, a0, -687 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: sub a0, a0, s0 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 57 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: addi a1, zero, -1003 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s4, s0, a0 +; RV64I-NEXT: lh s0, 8(s1) +; RV64I-NEXT: lui a0, 1048572 +; RV64I-NEXT: addiw a1, a0, -529 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srai a0, a0, 21 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, -124 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s5, s0, a0 +; RV64I-NEXT: lh s0, 16(s1) +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a1, a0, -1421 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srai a0, a0, 18 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 98 -; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, zero, 
-1003 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: sh a0, 6(s1) -; RV64I-NEXT: sh s0, 4(s1) -; RV64I-NEXT: sh s5, 2(s1) -; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: sh a0, 4(s2) +; RV64I-NEXT: sh s5, 2(s2) +; RV64I-NEXT: sh s4, 6(s2) +; RV64I-NEXT: sh s3, 0(s2) ; RV64I-NEXT: ld s5, 8(sp) ; RV64I-NEXT: ld s4, 16(sp) ; RV64I-NEXT: ld s3, 24(sp) @@ -149,76 +229,60 @@ ; ; RV64IM-LABEL: fold_srem_vec_1: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a6, 24(a1) -; RV64IM-NEXT: lh a3, 16(a1) -; RV64IM-NEXT: lh a4, 8(a1) -; RV64IM-NEXT: lh a1, 0(a1) -; RV64IM-NEXT: lui a5, 1045903 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -905 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -1767 -; RV64IM-NEXT: mulh a5, a1, a5 -; RV64IM-NEXT: add a5, a5, a1 -; RV64IM-NEXT: srli a2, a5, 63 -; RV64IM-NEXT: srli a5, a5, 6 -; RV64IM-NEXT: add a2, a5, a2 -; RV64IM-NEXT: addi a5, zero, 95 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: sub a1, a1, a2 -; RV64IM-NEXT: lui a2, 248 -; RV64IM-NEXT: addiw a2, a2, -1057 -; RV64IM-NEXT: slli a2, a2, 15 -; RV64IM-NEXT: addi a2, a2, -1057 -; RV64IM-NEXT: slli a2, a2, 15 -; RV64IM-NEXT: addi a2, a2, -1057 -; RV64IM-NEXT: slli a2, a2, 13 -; RV64IM-NEXT: addi a2, a2, -265 -; RV64IM-NEXT: mulh a2, a4, a2 -; RV64IM-NEXT: sub a2, a2, a4 -; RV64IM-NEXT: srli a5, a2, 63 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: add a2, a2, a5 -; RV64IM-NEXT: addi a5, zero, -124 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: sub a2, a4, a2 -; RV64IM-NEXT: lui a4, 2675 -; RV64IM-NEXT: addiw a4, a4, -251 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 1839 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 167 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 1505 -; RV64IM-NEXT: mulh a4, a3, a4 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 5 -; RV64IM-NEXT: add a4, a4, a5 -; RV64IM-NEXT: addi a5, zero, 98 -; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: lh a2, 0(a1) +; RV64IM-NEXT: lui a3, 1048571 +; RV64IM-NEXT: addiw a3, a3, -905 +; RV64IM-NEXT: mul a3, a2, a3 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: add a3, a3, a2 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 54 +; RV64IM-NEXT: srli a4, a3, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: addi a4, zero, 95 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: lh a4, 24(a1) +; RV64IM-NEXT: sub a6, a2, a3 +; RV64IM-NEXT: lui a3, 8 +; RV64IM-NEXT: addiw a3, a3, -687 +; RV64IM-NEXT: mul a3, a4, a3 +; RV64IM-NEXT: srli a3, a3, 16 ; RV64IM-NEXT: sub a3, a3, a4 -; RV64IM-NEXT: lui a4, 1040212 -; RV64IM-NEXT: addiw a4, a4, 1977 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, -1907 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, -453 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, -1213 -; RV64IM-NEXT: mulh a4, a6, a4 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 7 -; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 57 +; RV64IM-NEXT: srli a5, a3, 15 +; RV64IM-NEXT: andi a5, a5, 1 +; RV64IM-NEXT: add a3, a3, a5 ; RV64IM-NEXT: addi a5, zero, -1003 +; RV64IM-NEXT: mul a3, a3, a5 +; RV64IM-NEXT: lh a5, 8(a1) +; RV64IM-NEXT: sub a3, a4, a3 +; RV64IM-NEXT: lui a4, 1048572 +; RV64IM-NEXT: addiw a4, a4, -529 +; RV64IM-NEXT: mul a4, a5, a4 +; RV64IM-NEXT: srai a4, a4, 21 +; 
RV64IM-NEXT: srli a2, a4, 15 +; RV64IM-NEXT: andi a2, a2, 1 +; RV64IM-NEXT: add a2, a4, a2 +; RV64IM-NEXT: addi a4, zero, -124 +; RV64IM-NEXT: mul a2, a2, a4 +; RV64IM-NEXT: lh a1, 16(a1) +; RV64IM-NEXT: sub a2, a5, a2 +; RV64IM-NEXT: lui a4, 1 +; RV64IM-NEXT: addiw a4, a4, -1421 +; RV64IM-NEXT: mul a4, a1, a4 +; RV64IM-NEXT: srai a4, a4, 18 +; RV64IM-NEXT: srli a5, a4, 15 +; RV64IM-NEXT: andi a5, a5, 1 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: addi a5, zero, 98 ; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a4, a6, a4 -; RV64IM-NEXT: sh a4, 6(a0) -; RV64IM-NEXT: sh a3, 4(a0) +; RV64IM-NEXT: sub a1, a1, a4 +; RV64IM-NEXT: sh a1, 4(a0) ; RV64IM-NEXT: sh a2, 2(a0) -; RV64IM-NEXT: sh a1, 0(a0) +; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: sh a6, 0(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -235,30 +299,72 @@ ; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: sw s5, 4(sp) -; RV32I-NEXT: lh s2, 12(a1) -; RV32I-NEXT: lh s3, 8(a1) -; RV32I-NEXT: lh s0, 4(a1) -; RV32I-NEXT: lh a2, 0(a1) -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: sw s6, 0(sp) +; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lh s1, 0(a1) +; RV32I-NEXT: lui a0, 1048571 +; RV32I-NEXT: addi s0, a0, -905 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s4, s1, a0 +; RV32I-NEXT: lh s1, 4(s3) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s5, s1, a0 +; RV32I-NEXT: lh s1, 8(s3) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s6, s1, a0 +; RV32I-NEXT: lh s1, 12(s3) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: sh a0, 6(s1) -; RV32I-NEXT: sh s0, 4(s1) -; RV32I-NEXT: sh s5, 2(s1) -; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s1, a0 +; RV32I-NEXT: sh a0, 6(s2) +; RV32I-NEXT: sh s6, 4(s2) +; RV32I-NEXT: sh s5, 2(s2) +; RV32I-NEXT: sh s4, 0(s2) +; RV32I-NEXT: lw s6, 0(sp) ; RV32I-NEXT: lw s5, 4(sp) ; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) @@ -271,45 +377,57 @@ ; ; RV32IM-LABEL: fold_srem_vec_2: ; RV32IM: # 
%bb.0: -; RV32IM-NEXT: lh a6, 12(a1) -; RV32IM-NEXT: lh a3, 8(a1) -; RV32IM-NEXT: lh a4, 0(a1) -; RV32IM-NEXT: lh a1, 4(a1) -; RV32IM-NEXT: lui a5, 706409 -; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a2, a4, a5 -; RV32IM-NEXT: add a2, a2, a4 -; RV32IM-NEXT: srli a7, a2, 31 -; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: add a2, a2, a7 -; RV32IM-NEXT: addi a7, zero, 95 -; RV32IM-NEXT: mul a2, a2, a7 -; RV32IM-NEXT: sub t0, a4, a2 -; RV32IM-NEXT: mulh a4, a1, a5 -; RV32IM-NEXT: add a4, a4, a1 -; RV32IM-NEXT: srli a2, a4, 31 -; RV32IM-NEXT: srli a4, a4, 6 -; RV32IM-NEXT: add a2, a4, a2 -; RV32IM-NEXT: mul a2, a2, a7 -; RV32IM-NEXT: sub a1, a1, a2 -; RV32IM-NEXT: mulh a2, a3, a5 -; RV32IM-NEXT: add a2, a2, a3 -; RV32IM-NEXT: srli a4, a2, 31 -; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: add a2, a2, a4 -; RV32IM-NEXT: mul a2, a2, a7 -; RV32IM-NEXT: sub a2, a3, a2 -; RV32IM-NEXT: mulh a3, a6, a5 -; RV32IM-NEXT: add a3, a3, a6 -; RV32IM-NEXT: srli a4, a3, 31 -; RV32IM-NEXT: srli a3, a3, 6 +; RV32IM-NEXT: lh a2, 0(a1) +; RV32IM-NEXT: lui a3, 1048571 +; RV32IM-NEXT: addi a3, a3, -905 +; RV32IM-NEXT: mul a4, a2, a3 +; RV32IM-NEXT: srli a4, a4, 16 +; RV32IM-NEXT: add a4, a4, a2 +; RV32IM-NEXT: slli a4, a4, 16 +; RV32IM-NEXT: srai a4, a4, 22 +; RV32IM-NEXT: srli a5, a4, 15 +; RV32IM-NEXT: andi a5, a5, 1 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: lh a5, 4(a1) +; RV32IM-NEXT: addi a6, zero, 95 +; RV32IM-NEXT: mul a4, a4, a6 +; RV32IM-NEXT: sub a7, a2, a4 +; RV32IM-NEXT: mul a4, a5, a3 +; RV32IM-NEXT: srli a4, a4, 16 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: slli a4, a4, 16 +; RV32IM-NEXT: srai a4, a4, 22 +; RV32IM-NEXT: srli a2, a4, 15 +; RV32IM-NEXT: andi t0, a2, 1 +; RV32IM-NEXT: lh a2, 8(a1) +; RV32IM-NEXT: add a4, a4, t0 +; RV32IM-NEXT: mul a4, a4, a6 +; RV32IM-NEXT: sub t0, a5, a4 +; RV32IM-NEXT: mul a5, a2, a3 +; RV32IM-NEXT: srli a5, a5, 16 +; RV32IM-NEXT: add a5, a5, a2 +; RV32IM-NEXT: slli a5, a5, 16 +; RV32IM-NEXT: srai a5, a5, 22 +; RV32IM-NEXT: srli a4, a5, 15 +; RV32IM-NEXT: andi a4, a4, 1 +; RV32IM-NEXT: lh a1, 12(a1) +; RV32IM-NEXT: add a4, a5, a4 +; RV32IM-NEXT: mul a4, a4, a6 +; RV32IM-NEXT: sub a2, a2, a4 +; RV32IM-NEXT: mul a3, a1, a3 +; RV32IM-NEXT: srli a3, a3, 16 +; RV32IM-NEXT: add a3, a3, a1 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 22 +; RV32IM-NEXT: srli a4, a3, 15 +; RV32IM-NEXT: andi a4, a4, 1 ; RV32IM-NEXT: add a3, a3, a4 -; RV32IM-NEXT: mul a3, a3, a7 -; RV32IM-NEXT: sub a3, a6, a3 -; RV32IM-NEXT: sh a3, 6(a0) +; RV32IM-NEXT: mul a3, a3, a6 +; RV32IM-NEXT: sub a1, a1, a3 +; RV32IM-NEXT: sh a1, 6(a0) ; RV32IM-NEXT: sh a2, 4(a0) -; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh t0, 0(a0) +; RV32IM-NEXT: sh t0, 2(a0) +; RV32IM-NEXT: sh a7, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: fold_srem_vec_2: @@ -322,30 +440,72 @@ ; RV64I-NEXT: sd s3, 24(sp) ; RV64I-NEXT: sd s4, 16(sp) ; RV64I-NEXT: sd s5, 8(sp) -; RV64I-NEXT: lh s2, 24(a1) -; RV64I-NEXT: lh s3, 16(a1) -; RV64I-NEXT: lh s0, 8(a1) -; RV64I-NEXT: lh a2, 0(a1) -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: sd s6, 0(sp) +; RV64I-NEXT: mv s3, a1 +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: lh s1, 0(a1) +; RV64I-NEXT: lui a0, 1048571 +; RV64I-NEXT: addiw s0, a0, -905 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv 
a0, a2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s4, s1, a0 +; RV64I-NEXT: lh s1, 8(s3) +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s5, s1, a0 +; RV64I-NEXT: lh s1, 16(s3) +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s6, s1, a0 +; RV64I-NEXT: lh s1, 24(s3) +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: sh a0, 6(s1) -; RV64I-NEXT: sh s0, 4(s1) -; RV64I-NEXT: sh s5, 2(s1) -; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s1, a0 +; RV64I-NEXT: sh a0, 6(s2) +; RV64I-NEXT: sh s6, 4(s2) +; RV64I-NEXT: sh s5, 2(s2) +; RV64I-NEXT: sh s4, 0(s2) +; RV64I-NEXT: ld s6, 0(sp) ; RV64I-NEXT: ld s5, 8(sp) ; RV64I-NEXT: ld s4, 16(sp) ; RV64I-NEXT: ld s3, 24(sp) @@ -358,51 +518,57 @@ ; ; RV64IM-LABEL: fold_srem_vec_2: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a6, 24(a1) -; RV64IM-NEXT: lh a7, 16(a1) -; RV64IM-NEXT: lh a4, 8(a1) -; RV64IM-NEXT: lh a1, 0(a1) -; RV64IM-NEXT: lui a5, 1045903 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -905 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -1767 -; RV64IM-NEXT: mulh a2, a1, a5 -; RV64IM-NEXT: add a2, a2, a1 -; RV64IM-NEXT: srli a3, a2, 63 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: add a2, a2, a3 -; RV64IM-NEXT: addi a3, zero, 95 -; RV64IM-NEXT: mul a2, a2, a3 -; RV64IM-NEXT: sub t0, a1, a2 -; RV64IM-NEXT: mulh a2, a4, a5 -; RV64IM-NEXT: add a2, a2, a4 -; RV64IM-NEXT: srli a1, a2, 63 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: add a1, a2, a1 -; RV64IM-NEXT: mul a1, a1, a3 -; RV64IM-NEXT: sub a1, a4, a1 -; RV64IM-NEXT: mulh a2, a7, a5 -; RV64IM-NEXT: add a2, a2, a7 -; RV64IM-NEXT: srli a4, a2, 63 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: add a2, a2, a4 -; RV64IM-NEXT: mul a2, a2, a3 -; RV64IM-NEXT: sub a2, a7, a2 -; RV64IM-NEXT: mulh a4, a6, a5 -; RV64IM-NEXT: add a4, a4, a6 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 6 +; RV64IM-NEXT: lh a2, 0(a1) +; RV64IM-NEXT: lui a3, 1048571 +; RV64IM-NEXT: addiw a3, a3, -905 +; RV64IM-NEXT: mul a4, a2, a3 +; RV64IM-NEXT: srli a4, a4, 16 +; RV64IM-NEXT: add a4, a4, a2 +; RV64IM-NEXT: slli a4, a4, 48 +; RV64IM-NEXT: srai a4, a4, 54 +; RV64IM-NEXT: srli a5, a4, 15 +; RV64IM-NEXT: andi a5, a5, 1 ; RV64IM-NEXT: add a4, a4, a5 -; RV64IM-NEXT: mul a3, 
a4, a3 -; RV64IM-NEXT: sub a3, a6, a3 -; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: lh a5, 8(a1) +; RV64IM-NEXT: addi a6, zero, 95 +; RV64IM-NEXT: mul a4, a4, a6 +; RV64IM-NEXT: sub a7, a2, a4 +; RV64IM-NEXT: mul a4, a5, a3 +; RV64IM-NEXT: srli a4, a4, 16 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: slli a4, a4, 48 +; RV64IM-NEXT: srai a4, a4, 54 +; RV64IM-NEXT: srli a2, a4, 15 +; RV64IM-NEXT: andi t0, a2, 1 +; RV64IM-NEXT: lh a2, 16(a1) +; RV64IM-NEXT: add a4, a4, t0 +; RV64IM-NEXT: mul a4, a4, a6 +; RV64IM-NEXT: sub t0, a5, a4 +; RV64IM-NEXT: mul a5, a2, a3 +; RV64IM-NEXT: srli a5, a5, 16 +; RV64IM-NEXT: add a5, a5, a2 +; RV64IM-NEXT: slli a5, a5, 48 +; RV64IM-NEXT: srai a5, a5, 54 +; RV64IM-NEXT: srli a4, a5, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: lh a1, 24(a1) +; RV64IM-NEXT: add a4, a5, a4 +; RV64IM-NEXT: mul a4, a4, a6 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: mul a3, a1, a3 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: add a3, a3, a1 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 54 +; RV64IM-NEXT: srli a4, a3, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: mul a3, a3, a6 +; RV64IM-NEXT: sub a1, a1, a3 +; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: sh a2, 4(a0) -; RV64IM-NEXT: sh a1, 2(a0) -; RV64IM-NEXT: sh t0, 0(a0) +; RV64IM-NEXT: sh t0, 2(a0) +; RV64IM-NEXT: sh a7, 0(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -425,50 +591,78 @@ ; RV32I-NEXT: sw s7, 12(sp) ; RV32I-NEXT: sw s8, 8(sp) ; RV32I-NEXT: sw s9, 4(sp) -; RV32I-NEXT: lh s2, 0(a1) -; RV32I-NEXT: lh s3, 4(a1) -; RV32I-NEXT: lh s4, 8(a1) +; RV32I-NEXT: mv s4, a1 +; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: lh s1, 12(a1) -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: lui a0, 1048571 +; RV32I-NEXT: addi s0, a0, -905 ; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s5, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s6, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s7, a0 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add s2, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 ; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s8, a0 -; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s7, s1, a0 +; RV32I-NEXT: lh s1, 8(s4) ; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: call __divsi3 -; RV32I-NEXT: mv s9, a0 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add s5, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: call __divsi3 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv a0, s5 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s8, s1, a0 +; RV32I-NEXT: lh s1, 4(s4) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add s6, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call 
__divsi3 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv a0, s6 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s9, s1, a0 +; RV32I-NEXT: lh s1, 0(s4) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add s0, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __divsi3 -; RV32I-NEXT: add a0, s8, a0 -; RV32I-NEXT: add a1, s7, s1 -; RV32I-NEXT: add a2, s6, s4 -; RV32I-NEXT: add a3, s5, s9 -; RV32I-NEXT: sh a3, 6(s0) -; RV32I-NEXT: sh a2, 4(s0) -; RV32I-NEXT: sh a1, 2(s0) -; RV32I-NEXT: sh a0, 0(s0) +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s1, a0 +; RV32I-NEXT: add a0, a0, s0 +; RV32I-NEXT: add a1, s9, s6 +; RV32I-NEXT: add a2, s8, s5 +; RV32I-NEXT: add a3, s7, s2 +; RV32I-NEXT: sh a3, 6(s3) +; RV32I-NEXT: sh a2, 4(s3) +; RV32I-NEXT: sh a1, 2(s3) +; RV32I-NEXT: sh a0, 0(s3) ; RV32I-NEXT: lw s9, 4(sp) ; RV32I-NEXT: lw s8, 8(sp) ; RV32I-NEXT: lw s7, 12(sp) @@ -485,49 +679,61 @@ ; ; RV32IM-LABEL: combine_srem_sdiv: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a6, 0(a1) -; RV32IM-NEXT: lh a3, 4(a1) -; RV32IM-NEXT: lh a4, 12(a1) -; RV32IM-NEXT: lh a1, 8(a1) -; RV32IM-NEXT: lui a5, 706409 -; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a2, a4, a5 -; RV32IM-NEXT: add a2, a2, a4 -; RV32IM-NEXT: srli a7, a2, 31 -; RV32IM-NEXT: srai a2, a2, 6 -; RV32IM-NEXT: add t0, a2, a7 -; RV32IM-NEXT: addi a7, zero, 95 -; RV32IM-NEXT: mul a2, t0, a7 -; RV32IM-NEXT: sub t1, a4, a2 -; RV32IM-NEXT: mulh a4, a1, a5 -; RV32IM-NEXT: add a4, a4, a1 -; RV32IM-NEXT: srli a2, a4, 31 -; RV32IM-NEXT: srai a4, a4, 6 -; RV32IM-NEXT: add a2, a4, a2 -; RV32IM-NEXT: mul a4, a2, a7 -; RV32IM-NEXT: sub t2, a1, a4 -; RV32IM-NEXT: mulh a4, a3, a5 -; RV32IM-NEXT: add a4, a4, a3 -; RV32IM-NEXT: srli a1, a4, 31 -; RV32IM-NEXT: srai a4, a4, 6 -; RV32IM-NEXT: add a1, a4, a1 -; RV32IM-NEXT: mul a4, a1, a7 -; RV32IM-NEXT: sub a3, a3, a4 -; RV32IM-NEXT: mulh a4, a6, a5 -; RV32IM-NEXT: add a4, a4, a6 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srai a4, a4, 6 +; RV32IM-NEXT: lh a2, 12(a1) +; RV32IM-NEXT: lui a3, 1048571 +; RV32IM-NEXT: addi a3, a3, -905 +; RV32IM-NEXT: mul a4, a2, a3 +; RV32IM-NEXT: srli a4, a4, 16 +; RV32IM-NEXT: add a4, a4, a2 +; RV32IM-NEXT: slli a4, a4, 16 +; RV32IM-NEXT: srai a4, a4, 22 +; RV32IM-NEXT: srli a5, a4, 15 +; RV32IM-NEXT: andi a5, a5, 1 +; RV32IM-NEXT: add a7, a4, a5 +; RV32IM-NEXT: lh a5, 8(a1) +; RV32IM-NEXT: addi a6, zero, 95 +; RV32IM-NEXT: mul a4, a7, a6 +; RV32IM-NEXT: sub t0, a2, a4 +; RV32IM-NEXT: mul a4, a5, a3 +; RV32IM-NEXT: srli a4, a4, 16 ; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: mul a5, a4, a7 -; RV32IM-NEXT: sub a5, a6, a5 +; RV32IM-NEXT: slli a4, a4, 16 +; RV32IM-NEXT: srai a4, a4, 22 +; RV32IM-NEXT: srli a2, a4, 15 +; RV32IM-NEXT: andi t1, a2, 1 +; RV32IM-NEXT: lh a2, 4(a1) +; RV32IM-NEXT: add t1, a4, t1 +; RV32IM-NEXT: mul a4, t1, a6 +; RV32IM-NEXT: sub t2, a5, a4 +; RV32IM-NEXT: mul a5, a2, a3 +; RV32IM-NEXT: srli a5, a5, 16 +; RV32IM-NEXT: add a5, a5, a2 +; RV32IM-NEXT: slli a5, a5, 16 +; RV32IM-NEXT: srai a5, a5, 22 +; RV32IM-NEXT: srli a4, a5, 15 +; RV32IM-NEXT: andi a4, a4, 1 +; RV32IM-NEXT: lh a1, 0(a1) ; RV32IM-NEXT: add a4, a5, a4 -; RV32IM-NEXT: add a1, a3, a1 -; RV32IM-NEXT: add a2, t2, a2 -; RV32IM-NEXT: add a3, t1, t0 -; RV32IM-NEXT: sh a3, 6(a0) -; RV32IM-NEXT: sh a2, 4(a0) -; RV32IM-NEXT: 
sh a1, 2(a0) -; RV32IM-NEXT: sh a4, 0(a0) +; RV32IM-NEXT: mul a5, a4, a6 +; RV32IM-NEXT: sub a2, a2, a5 +; RV32IM-NEXT: mul a3, a1, a3 +; RV32IM-NEXT: srli a3, a3, 16 +; RV32IM-NEXT: add a3, a3, a1 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 22 +; RV32IM-NEXT: srli a5, a3, 15 +; RV32IM-NEXT: andi a5, a5, 1 +; RV32IM-NEXT: add a3, a3, a5 +; RV32IM-NEXT: mul a5, a3, a6 +; RV32IM-NEXT: sub a1, a1, a5 +; RV32IM-NEXT: add a1, a1, a3 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: add a3, t2, t1 +; RV32IM-NEXT: add a4, t0, a7 +; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a2, 2(a0) +; RV32IM-NEXT: sh a1, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: combine_srem_sdiv: @@ -544,50 +750,78 @@ ; RV64I-NEXT: sd s7, 24(sp) ; RV64I-NEXT: sd s8, 16(sp) ; RV64I-NEXT: sd s9, 8(sp) -; RV64I-NEXT: lh s2, 0(a1) -; RV64I-NEXT: lh s3, 8(a1) -; RV64I-NEXT: lh s4, 16(a1) +; RV64I-NEXT: mv s4, a1 +; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: lh s1, 24(a1) -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: lui a0, 1048571 +; RV64I-NEXT: addiw s0, a0, -905 ; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s5, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s6, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s7, a0 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add s2, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 ; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s8, a0 -; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s7, s1, a0 +; RV64I-NEXT: lh s1, 16(s4) ; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: call __divdi3 -; RV64I-NEXT: mv s9, a0 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add s5, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: call __divdi3 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: mv a0, s5 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s8, s1, a0 +; RV64I-NEXT: lh s1, 8(s4) +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add s6, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __divdi3 -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv a0, s6 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s9, s1, a0 +; RV64I-NEXT: lh s1, 0(s4) +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add s0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __divdi3 -; RV64I-NEXT: add a0, s8, a0 -; RV64I-NEXT: add a1, s7, s1 -; RV64I-NEXT: add a2, s6, s4 -; RV64I-NEXT: add a3, s5, s9 -; RV64I-NEXT: sh a3, 6(s0) -; RV64I-NEXT: sh a2, 4(s0) -; 
RV64I-NEXT: sh a1, 2(s0) -; RV64I-NEXT: sh a0, 0(s0) +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s1, a0 +; RV64I-NEXT: add a0, a0, s0 +; RV64I-NEXT: add a1, s9, s6 +; RV64I-NEXT: add a2, s8, s5 +; RV64I-NEXT: add a3, s7, s2 +; RV64I-NEXT: sh a3, 6(s3) +; RV64I-NEXT: sh a2, 4(s3) +; RV64I-NEXT: sh a1, 2(s3) +; RV64I-NEXT: sh a0, 0(s3) ; RV64I-NEXT: ld s9, 8(sp) ; RV64I-NEXT: ld s8, 16(sp) ; RV64I-NEXT: ld s7, 24(sp) @@ -604,55 +838,61 @@ ; ; RV64IM-LABEL: combine_srem_sdiv: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a6, 0(a1) -; RV64IM-NEXT: lh a7, 8(a1) -; RV64IM-NEXT: lh a4, 16(a1) -; RV64IM-NEXT: lh a1, 24(a1) -; RV64IM-NEXT: lui a5, 1045903 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -905 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -1767 -; RV64IM-NEXT: mulh a2, a1, a5 -; RV64IM-NEXT: add a2, a2, a1 -; RV64IM-NEXT: srli a3, a2, 63 -; RV64IM-NEXT: srai a2, a2, 6 -; RV64IM-NEXT: add t3, a2, a3 -; RV64IM-NEXT: addi t0, zero, 95 -; RV64IM-NEXT: mul a3, t3, t0 -; RV64IM-NEXT: sub t1, a1, a3 -; RV64IM-NEXT: mulh a3, a4, a5 -; RV64IM-NEXT: add a3, a3, a4 -; RV64IM-NEXT: srli a1, a3, 63 -; RV64IM-NEXT: srai a3, a3, 6 -; RV64IM-NEXT: add a1, a3, a1 -; RV64IM-NEXT: mul a3, a1, t0 -; RV64IM-NEXT: sub t2, a4, a3 -; RV64IM-NEXT: mulh a4, a7, a5 -; RV64IM-NEXT: add a4, a4, a7 -; RV64IM-NEXT: srli a3, a4, 63 -; RV64IM-NEXT: srai a4, a4, 6 -; RV64IM-NEXT: add a3, a4, a3 -; RV64IM-NEXT: mul a4, a3, t0 -; RV64IM-NEXT: sub a4, a7, a4 -; RV64IM-NEXT: mulh a5, a6, a5 -; RV64IM-NEXT: add a5, a5, a6 -; RV64IM-NEXT: srli a2, a5, 63 -; RV64IM-NEXT: srai a5, a5, 6 -; RV64IM-NEXT: add a2, a5, a2 -; RV64IM-NEXT: mul a5, a2, t0 -; RV64IM-NEXT: sub a5, a6, a5 -; RV64IM-NEXT: add a2, a5, a2 -; RV64IM-NEXT: add a3, a4, a3 -; RV64IM-NEXT: add a1, t2, a1 -; RV64IM-NEXT: add a4, t1, t3 +; RV64IM-NEXT: lh a2, 24(a1) +; RV64IM-NEXT: lui a3, 1048571 +; RV64IM-NEXT: addiw a3, a3, -905 +; RV64IM-NEXT: mul a4, a2, a3 +; RV64IM-NEXT: srli a4, a4, 16 +; RV64IM-NEXT: add a4, a4, a2 +; RV64IM-NEXT: slli a4, a4, 48 +; RV64IM-NEXT: srai a4, a4, 54 +; RV64IM-NEXT: srli a5, a4, 15 +; RV64IM-NEXT: andi a5, a5, 1 +; RV64IM-NEXT: add a7, a4, a5 +; RV64IM-NEXT: lh a5, 16(a1) +; RV64IM-NEXT: addi a6, zero, 95 +; RV64IM-NEXT: mul a4, a7, a6 +; RV64IM-NEXT: sub t0, a2, a4 +; RV64IM-NEXT: mul a4, a5, a3 +; RV64IM-NEXT: srli a4, a4, 16 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: slli a4, a4, 48 +; RV64IM-NEXT: srai a4, a4, 54 +; RV64IM-NEXT: srli a2, a4, 15 +; RV64IM-NEXT: andi t1, a2, 1 +; RV64IM-NEXT: lh a2, 8(a1) +; RV64IM-NEXT: add t1, a4, t1 +; RV64IM-NEXT: mul a4, t1, a6 +; RV64IM-NEXT: sub t2, a5, a4 +; RV64IM-NEXT: mul a5, a2, a3 +; RV64IM-NEXT: srli a5, a5, 16 +; RV64IM-NEXT: add a5, a5, a2 +; RV64IM-NEXT: slli a5, a5, 48 +; RV64IM-NEXT: srai a5, a5, 54 +; RV64IM-NEXT: srli a4, a5, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: lh a1, 0(a1) +; RV64IM-NEXT: add a4, a5, a4 +; RV64IM-NEXT: mul a5, a4, a6 +; RV64IM-NEXT: sub a2, a2, a5 +; RV64IM-NEXT: mul a3, a1, a3 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: add a3, a3, a1 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 54 +; RV64IM-NEXT: srli a5, a3, 15 +; RV64IM-NEXT: andi a5, a5, 1 +; RV64IM-NEXT: add a3, a3, a5 +; RV64IM-NEXT: mul a5, a3, a6 +; RV64IM-NEXT: sub a1, a1, a5 +; RV64IM-NEXT: add a1, a1, a3 +; RV64IM-NEXT: add a2, a2, a4 +; RV64IM-NEXT: add a3, t2, t1 +; RV64IM-NEXT: add a4, t0, a7 ; 
RV64IM-NEXT: sh a4, 6(a0) -; RV64IM-NEXT: sh a1, 4(a0) -; RV64IM-NEXT: sh a3, 2(a0) -; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a3, 4(a0) +; RV64IM-NEXT: sh a2, 2(a0) +; RV64IM-NEXT: sh a1, 0(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, %2 = sdiv <4 x i16> %x, @@ -670,33 +910,64 @@ ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: lh a2, 0(a1) -; RV32I-NEXT: lh a0, 12(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: slli a0, a2, 15 +; RV32I-NEXT: sub a0, a2, a0 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 21 +; RV32I-NEXT: srli a3, a0, 15 +; RV32I-NEXT: andi a3, a3, 1 +; RV32I-NEXT: lh a4, 4(a1) +; RV32I-NEXT: add a0, a0, a3 +; RV32I-NEXT: slli a0, a0, 6 +; RV32I-NEXT: sub s2, a2, a0 +; RV32I-NEXT: slli a0, a4, 15 +; RV32I-NEXT: sub a0, a4, a0 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, a4 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 20 +; RV32I-NEXT: srli a2, a0, 15 +; RV32I-NEXT: andi a2, a2, 1 ; RV32I-NEXT: lh a3, 8(a1) -; RV32I-NEXT: lh a1, 4(a1) -; RV32I-NEXT: srli a4, a2, 26 -; RV32I-NEXT: add a4, a2, a4 -; RV32I-NEXT: lui a6, 16 -; RV32I-NEXT: addi a5, a6, -64 -; RV32I-NEXT: and a4, a4, a5 -; RV32I-NEXT: sub s2, a2, a4 -; RV32I-NEXT: srli a2, a1, 27 -; RV32I-NEXT: add a2, a1, a2 -; RV32I-NEXT: addi a4, a6, -32 -; RV32I-NEXT: and a2, a2, a4 -; RV32I-NEXT: sub s3, a1, a2 -; RV32I-NEXT: srli a1, a3, 29 -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: addi a2, a6, -8 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: sub s1, a3, a1 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 5 +; RV32I-NEXT: sub s3, a4, a0 +; RV32I-NEXT: slli a0, a3, 15 +; RV32I-NEXT: sub a0, a3, a0 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, a3 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 18 +; RV32I-NEXT: srli a2, a0, 15 +; RV32I-NEXT: andi a2, a2, 1 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 3 +; RV32I-NEXT: sub s4, a3, a0 +; RV32I-NEXT: lh s1, 12(a1) +; RV32I-NEXT: lui a0, 1048571 +; RV32I-NEXT: addi a1, a0, -905 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 22 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s1, a0 ; RV32I-NEXT: sh a0, 6(s0) -; RV32I-NEXT: sh s1, 4(s0) +; RV32I-NEXT: sh s4, 4(s0) ; RV32I-NEXT: sh s3, 2(s0) ; RV32I-NEXT: sh s2, 0(s0) +; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -707,40 +978,59 @@ ; ; RV32IM-LABEL: dont_fold_srem_power_of_two: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a6, 8(a1) -; RV32IM-NEXT: lh a3, 4(a1) -; RV32IM-NEXT: lh a4, 12(a1) -; RV32IM-NEXT: lh a1, 0(a1) -; RV32IM-NEXT: lui a5, 706409 -; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a5, a4, a5 -; RV32IM-NEXT: add a5, a5, a4 -; RV32IM-NEXT: srli a2, a5, 31 -; RV32IM-NEXT: srli a5, a5, 6 -; RV32IM-NEXT: add a2, a5, a2 +; RV32IM-NEXT: lh a2, 0(a1) +; RV32IM-NEXT: lui a3, 1048568 +; RV32IM-NEXT: addi a3, a3, 1 +; RV32IM-NEXT: mul a4, a2, a3 +; RV32IM-NEXT: srli a4, a4, 16 +; RV32IM-NEXT: add a4, a4, a2 +; RV32IM-NEXT: slli a4, a4, 16 +; RV32IM-NEXT: srai a4, a4, 21 +; RV32IM-NEXT: srli a5, a4, 15 +; RV32IM-NEXT: andi a6, a5, 1 +; 
RV32IM-NEXT: lh a5, 4(a1) +; RV32IM-NEXT: add a4, a4, a6 +; RV32IM-NEXT: slli a4, a4, 6 +; RV32IM-NEXT: sub a6, a2, a4 +; RV32IM-NEXT: mul a4, a5, a3 +; RV32IM-NEXT: srli a4, a4, 16 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: slli a4, a4, 16 +; RV32IM-NEXT: srai a4, a4, 20 +; RV32IM-NEXT: srli a2, a4, 15 +; RV32IM-NEXT: andi a7, a2, 1 +; RV32IM-NEXT: lh a2, 8(a1) +; RV32IM-NEXT: add a4, a4, a7 +; RV32IM-NEXT: slli a4, a4, 5 +; RV32IM-NEXT: sub a4, a5, a4 +; RV32IM-NEXT: mul a3, a2, a3 +; RV32IM-NEXT: srli a3, a3, 16 +; RV32IM-NEXT: add a3, a3, a2 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 18 +; RV32IM-NEXT: srli a5, a3, 15 +; RV32IM-NEXT: andi a5, a5, 1 +; RV32IM-NEXT: add a3, a3, a5 +; RV32IM-NEXT: slli a3, a3, 3 +; RV32IM-NEXT: lh a1, 12(a1) +; RV32IM-NEXT: sub a2, a2, a3 +; RV32IM-NEXT: lui a3, 1048571 +; RV32IM-NEXT: addi a3, a3, -905 +; RV32IM-NEXT: mul a3, a1, a3 +; RV32IM-NEXT: srli a3, a3, 16 +; RV32IM-NEXT: add a3, a3, a1 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 22 +; RV32IM-NEXT: srli a5, a3, 15 +; RV32IM-NEXT: andi a5, a5, 1 +; RV32IM-NEXT: add a3, a3, a5 ; RV32IM-NEXT: addi a5, zero, 95 -; RV32IM-NEXT: mul a2, a2, a5 -; RV32IM-NEXT: sub a7, a4, a2 -; RV32IM-NEXT: srli a4, a1, 26 -; RV32IM-NEXT: add a4, a1, a4 -; RV32IM-NEXT: lui a5, 16 -; RV32IM-NEXT: addi a2, a5, -64 -; RV32IM-NEXT: and a2, a4, a2 -; RV32IM-NEXT: sub a1, a1, a2 -; RV32IM-NEXT: srli a2, a3, 27 -; RV32IM-NEXT: add a2, a3, a2 -; RV32IM-NEXT: addi a4, a5, -32 -; RV32IM-NEXT: and a2, a2, a4 -; RV32IM-NEXT: sub a2, a3, a2 -; RV32IM-NEXT: srli a3, a6, 29 -; RV32IM-NEXT: add a3, a6, a3 -; RV32IM-NEXT: addi a4, a5, -8 -; RV32IM-NEXT: and a3, a3, a4 -; RV32IM-NEXT: sub a3, a6, a3 -; RV32IM-NEXT: sh a3, 4(a0) -; RV32IM-NEXT: sh a2, 2(a0) -; RV32IM-NEXT: sh a1, 0(a0) -; RV32IM-NEXT: sh a7, 6(a0) +; RV32IM-NEXT: mul a3, a3, a5 +; RV32IM-NEXT: sub a1, a1, a3 +; RV32IM-NEXT: sh a1, 6(a0) +; RV32IM-NEXT: sh a2, 4(a0) +; RV32IM-NEXT: sh a4, 2(a0) +; RV32IM-NEXT: sh a6, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_srem_power_of_two: @@ -751,33 +1041,64 @@ ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) ; RV64I-NEXT: sd s3, 8(sp) -; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: sd s4, 0(sp) ; RV64I-NEXT: lh a2, 0(a1) -; RV64I-NEXT: lh a0, 24(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: slli a0, a2, 15 +; RV64I-NEXT: sub a0, a2, a0 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 53 +; RV64I-NEXT: srli a3, a0, 15 +; RV64I-NEXT: andi a3, a3, 1 +; RV64I-NEXT: lh a4, 8(a1) +; RV64I-NEXT: add a0, a0, a3 +; RV64I-NEXT: slli a0, a0, 6 +; RV64I-NEXT: sub s2, a2, a0 +; RV64I-NEXT: slli a0, a4, 15 +; RV64I-NEXT: sub a0, a4, a0 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 52 +; RV64I-NEXT: srli a2, a0, 15 +; RV64I-NEXT: andi a2, a2, 1 ; RV64I-NEXT: lh a3, 16(a1) -; RV64I-NEXT: lh a1, 8(a1) -; RV64I-NEXT: srli a4, a2, 58 -; RV64I-NEXT: add a4, a2, a4 -; RV64I-NEXT: lui a6, 16 -; RV64I-NEXT: addiw a5, a6, -64 -; RV64I-NEXT: and a4, a4, a5 -; RV64I-NEXT: sub s2, a2, a4 -; RV64I-NEXT: srli a2, a1, 59 -; RV64I-NEXT: add a2, a1, a2 -; RV64I-NEXT: addiw a4, a6, -32 -; RV64I-NEXT: and a2, a2, a4 -; RV64I-NEXT: sub s3, a1, a2 -; RV64I-NEXT: srli a1, a3, 61 -; RV64I-NEXT: add a1, a3, a1 -; RV64I-NEXT: addiw a2, a6, -8 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub s1, a3, a1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 5 +; RV64I-NEXT: sub s3, 
a4, a0 +; RV64I-NEXT: slli a0, a3, 15 +; RV64I-NEXT: sub a0, a3, a0 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, a3 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 50 +; RV64I-NEXT: srli a2, a0, 15 +; RV64I-NEXT: andi a2, a2, 1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 3 +; RV64I-NEXT: sub s4, a3, a0 +; RV64I-NEXT: lh s1, 24(a1) +; RV64I-NEXT: lui a0, 1048571 +; RV64I-NEXT: addiw a1, a0, -905 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 54 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s1, a0 ; RV64I-NEXT: sh a0, 6(s0) -; RV64I-NEXT: sh s1, 4(s0) +; RV64I-NEXT: sh s4, 4(s0) ; RV64I-NEXT: sh s3, 2(s0) ; RV64I-NEXT: sh s2, 0(s0) +; RV64I-NEXT: ld s4, 0(sp) ; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) @@ -788,46 +1109,60 @@ ; ; RV64IM-LABEL: dont_fold_srem_power_of_two: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a6, 16(a1) -; RV64IM-NEXT: lh a3, 8(a1) -; RV64IM-NEXT: lh a4, 0(a1) +; RV64IM-NEXT: lh a2, 0(a1) +; RV64IM-NEXT: slli a3, a2, 15 +; RV64IM-NEXT: sub a3, a2, a3 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: add a3, a3, a2 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 53 +; RV64IM-NEXT: srli a4, a3, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: lh a5, 8(a1) +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: slli a3, a3, 6 +; RV64IM-NEXT: sub a6, a2, a3 +; RV64IM-NEXT: slli a3, a5, 15 +; RV64IM-NEXT: sub a3, a5, a3 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: add a3, a3, a5 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 52 +; RV64IM-NEXT: srli a4, a3, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: lh a2, 16(a1) +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: slli a3, a3, 5 +; RV64IM-NEXT: sub a3, a5, a3 +; RV64IM-NEXT: slli a4, a2, 15 +; RV64IM-NEXT: sub a4, a2, a4 +; RV64IM-NEXT: srli a4, a4, 16 +; RV64IM-NEXT: add a4, a4, a2 +; RV64IM-NEXT: slli a4, a4, 48 +; RV64IM-NEXT: srai a4, a4, 50 +; RV64IM-NEXT: srli a5, a4, 15 +; RV64IM-NEXT: andi a5, a5, 1 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: slli a4, a4, 3 ; RV64IM-NEXT: lh a1, 24(a1) -; RV64IM-NEXT: lui a5, 1045903 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -905 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -1767 -; RV64IM-NEXT: mulh a5, a1, a5 -; RV64IM-NEXT: add a5, a5, a1 -; RV64IM-NEXT: srli a2, a5, 63 -; RV64IM-NEXT: srli a5, a5, 6 -; RV64IM-NEXT: add a2, a5, a2 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: lui a4, 1048571 +; RV64IM-NEXT: addiw a4, a4, -905 +; RV64IM-NEXT: mul a4, a1, a4 +; RV64IM-NEXT: srli a4, a4, 16 +; RV64IM-NEXT: add a4, a4, a1 +; RV64IM-NEXT: slli a4, a4, 48 +; RV64IM-NEXT: srai a4, a4, 54 +; RV64IM-NEXT: srli a5, a4, 15 +; RV64IM-NEXT: andi a5, a5, 1 +; RV64IM-NEXT: add a4, a4, a5 ; RV64IM-NEXT: addi a5, zero, 95 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: sub a7, a1, a2 -; RV64IM-NEXT: srli a2, a4, 58 -; RV64IM-NEXT: add a2, a4, a2 -; RV64IM-NEXT: lui a5, 16 -; RV64IM-NEXT: addiw a1, a5, -64 -; RV64IM-NEXT: and a1, a2, a1 -; RV64IM-NEXT: sub a1, a4, a1 -; RV64IM-NEXT: srli a2, a3, 59 -; RV64IM-NEXT: add a2, a3, a2 -; RV64IM-NEXT: addiw a4, a5, -32 -; RV64IM-NEXT: and a2, a2, a4 
-; RV64IM-NEXT: sub a2, a3, a2 -; RV64IM-NEXT: srli a3, a6, 61 -; RV64IM-NEXT: add a3, a6, a3 -; RV64IM-NEXT: addiw a4, a5, -8 -; RV64IM-NEXT: and a3, a3, a4 -; RV64IM-NEXT: sub a3, a6, a3 -; RV64IM-NEXT: sh a3, 4(a0) -; RV64IM-NEXT: sh a2, 2(a0) -; RV64IM-NEXT: sh a1, 0(a0) -; RV64IM-NEXT: sh a7, 6(a0) +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a1, a1, a4 +; RV64IM-NEXT: sh a1, 6(a0) +; RV64IM-NEXT: sh a2, 4(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a6, 0(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -843,26 +1178,55 @@ ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: lh s2, 12(a1) -; RV32I-NEXT: lh s1, 8(a1) -; RV32I-NEXT: lh a2, 4(a1) -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, zero, 654 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: sw s4, 8(sp) +; RV32I-NEXT: sw s5, 4(sp) +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lh s0, 8(a1) +; RV32I-NEXT: lui a0, 1048571 +; RV32I-NEXT: addi a1, a0, 535 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s0 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 20 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 23 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a1, a0, 1327 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: sh zero, 0(s0) -; RV32I-NEXT: sh a0, 6(s0) -; RV32I-NEXT: sh s1, 4(s0) -; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s3, s0, a0 +; RV32I-NEXT: lh s0, 4(s1) +; RV32I-NEXT: lui s4, 3 +; RV32I-NEXT: addi a1, s4, 539 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srai a0, a0, 23 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: addi a1, zero, 654 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s5, s0, a0 +; RV32I-NEXT: lh s0, 12(s1) +; RV32I-NEXT: addi a1, s4, 87 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srai a0, a0, 26 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: addi a1, a1, 1327 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: sh zero, 0(s2) +; RV32I-NEXT: sh a0, 6(s2) +; RV32I-NEXT: sh s5, 2(s2) +; RV32I-NEXT: sh s3, 4(s2) +; RV32I-NEXT: lw s5, 4(sp) +; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -873,137 +1237,155 @@ ; ; RV32IM-LABEL: dont_fold_srem_one: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a2, 12(a1) -; RV32IM-NEXT: lh a3, 4(a1) -; RV32IM-NEXT: lh a1, 8(a1) -; RV32IM-NEXT: lui a4, 820904 -; RV32IM-NEXT: addi a4, a4, -1903 -; RV32IM-NEXT: mulh a4, a3, a4 -; RV32IM-NEXT: add a4, a4, a3 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 9 -; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: lh a2, 8(a1) +; RV32IM-NEXT: lui a3, 1048571 +; RV32IM-NEXT: addi a3, a3, 535 +; RV32IM-NEXT: mul a3, a2, a3 +; RV32IM-NEXT: srli a3, a3, 16 +; RV32IM-NEXT: add a3, a3, a2 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 20 +; RV32IM-NEXT: srli a4, a3, 15 +; RV32IM-NEXT: andi a4, a4, 1 +; RV32IM-NEXT: add a3, a3, a4 +; RV32IM-NEXT: addi a4, zero, 23 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: lh a4, 4(a1) +; RV32IM-NEXT: sub a6, 
a2, a3 +; RV32IM-NEXT: lui a3, 3 +; RV32IM-NEXT: addi a5, a3, 539 +; RV32IM-NEXT: mul a5, a4, a5 +; RV32IM-NEXT: srai a5, a5, 23 +; RV32IM-NEXT: srli a2, a5, 15 +; RV32IM-NEXT: andi a2, a2, 1 +; RV32IM-NEXT: add a2, a5, a2 ; RV32IM-NEXT: addi a5, zero, 654 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a3, a3, a4 -; RV32IM-NEXT: lui a4, 729444 -; RV32IM-NEXT: addi a4, a4, 713 -; RV32IM-NEXT: mulh a4, a1, a4 -; RV32IM-NEXT: add a4, a4, a1 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 4 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: addi a5, zero, 23 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a1, a1, a4 -; RV32IM-NEXT: lui a4, 395996 -; RV32IM-NEXT: addi a4, a4, -2009 -; RV32IM-NEXT: mulh a4, a2, a4 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 11 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: lui a5, 1 -; RV32IM-NEXT: addi a5, a5, 1327 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a2, a2, a4 +; RV32IM-NEXT: lh a1, 12(a1) +; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: sub a2, a4, a2 +; RV32IM-NEXT: addi a3, a3, 87 +; RV32IM-NEXT: mul a3, a1, a3 +; RV32IM-NEXT: srai a3, a3, 26 +; RV32IM-NEXT: srli a4, a3, 15 +; RV32IM-NEXT: andi a4, a4, 1 +; RV32IM-NEXT: add a3, a3, a4 +; RV32IM-NEXT: lui a4, 1 +; RV32IM-NEXT: addi a4, a4, 1327 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: sub a1, a1, a3 ; RV32IM-NEXT: sh zero, 0(a0) -; RV32IM-NEXT: sh a2, 6(a0) -; RV32IM-NEXT: sh a1, 4(a0) -; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a1, 6(a0) +; RV32IM-NEXT: sh a2, 2(a0) +; RV32IM-NEXT: sh a6, 4(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_srem_one: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) -; RV64I-NEXT: lh s2, 24(a1) -; RV64I-NEXT: lh s1, 16(a1) -; RV64I-NEXT: lh a2, 8(a1) -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, zero, 654 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: sd ra, 56(sp) +; RV64I-NEXT: sd s0, 48(sp) +; RV64I-NEXT: sd s1, 40(sp) +; RV64I-NEXT: sd s2, 32(sp) +; RV64I-NEXT: sd s3, 24(sp) +; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: sd s5, 8(sp) +; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: lh s0, 16(a1) +; RV64I-NEXT: lui a0, 1048571 +; RV64I-NEXT: addiw a1, a0, 535 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s0 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 52 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 23 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a1, a0, 1327 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: sh zero, 0(s0) -; RV64I-NEXT: sh a0, 6(s0) -; RV64I-NEXT: sh s1, 4(s0) -; RV64I-NEXT: sh s3, 2(s0) -; RV64I-NEXT: ld s3, 8(sp) -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s3, s0, a0 +; RV64I-NEXT: lh s0, 8(s1) +; RV64I-NEXT: lui s4, 3 +; RV64I-NEXT: addiw a1, s4, 539 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srai a0, a0, 23 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: addi a1, zero, 654 +; 
RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s5, s0, a0 +; RV64I-NEXT: lh s0, 24(s1) +; RV64I-NEXT: addiw a1, s4, 87 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srai a0, a0, 26 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: addiw a1, a1, 1327 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: sh zero, 0(s2) +; RV64I-NEXT: sh a0, 6(s2) +; RV64I-NEXT: sh s5, 2(s2) +; RV64I-NEXT: sh s3, 4(s2) +; RV64I-NEXT: ld s5, 8(sp) +; RV64I-NEXT: ld s4, 16(sp) +; RV64I-NEXT: ld s3, 24(sp) +; RV64I-NEXT: ld s2, 32(sp) +; RV64I-NEXT: ld s1, 40(sp) +; RV64I-NEXT: ld s0, 48(sp) +; RV64I-NEXT: ld ra, 56(sp) +; RV64I-NEXT: addi sp, sp, 64 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: dont_fold_srem_one: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a2, 24(a1) -; RV64IM-NEXT: lh a3, 8(a1) -; RV64IM-NEXT: lh a1, 16(a1) -; RV64IM-NEXT: lui a4, 1043590 -; RV64IM-NEXT: addiw a4, a4, -1781 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 1069 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, -1959 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 357 -; RV64IM-NEXT: mulh a4, a1, a4 -; RV64IM-NEXT: add a4, a4, a1 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 4 -; RV64IM-NEXT: add a4, a4, a5 -; RV64IM-NEXT: addi a5, zero, 23 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a1, a1, a4 -; RV64IM-NEXT: lui a4, 6413 -; RV64IM-NEXT: addiw a4, a4, 1265 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 1027 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 1077 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 965 -; RV64IM-NEXT: mulh a4, a3, a4 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 8 -; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: lh a2, 16(a1) +; RV64IM-NEXT: lui a3, 1048571 +; RV64IM-NEXT: addiw a3, a3, 535 +; RV64IM-NEXT: mul a3, a2, a3 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: add a3, a3, a2 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 52 +; RV64IM-NEXT: srli a4, a3, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: addi a4, zero, 23 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: lh a4, 8(a1) +; RV64IM-NEXT: sub a6, a2, a3 +; RV64IM-NEXT: lui a3, 3 +; RV64IM-NEXT: addiw a5, a3, 539 +; RV64IM-NEXT: mul a5, a4, a5 +; RV64IM-NEXT: srai a5, a5, 23 +; RV64IM-NEXT: srli a2, a5, 15 +; RV64IM-NEXT: andi a2, a2, 1 +; RV64IM-NEXT: add a2, a5, a2 ; RV64IM-NEXT: addi a5, zero, 654 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a3, a3, a4 -; RV64IM-NEXT: lui a4, 12375 -; RV64IM-NEXT: addiw a4, a4, -575 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 883 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, -431 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 1959 -; RV64IM-NEXT: mulh a4, a2, a4 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 11 -; RV64IM-NEXT: add a4, a4, a5 -; RV64IM-NEXT: lui a5, 1 -; RV64IM-NEXT: addiw a5, a5, 1327 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: lh a1, 24(a1) +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a2, a4, a2 +; RV64IM-NEXT: addiw a3, a3, 87 +; RV64IM-NEXT: mul a3, a1, a3 +; RV64IM-NEXT: srai a3, a3, 26 +; RV64IM-NEXT: srli a4, a3, 15 +; RV64IM-NEXT: andi a4, a4, 1 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: lui a4, 1 +; RV64IM-NEXT: addiw a4, a4, 1327 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: sub a1, a1, a3 ; RV64IM-NEXT: sh zero, 
0(a0) -; RV64IM-NEXT: sh a2, 6(a0) -; RV64IM-NEXT: sh a3, 2(a0) -; RV64IM-NEXT: sh a1, 4(a0) +; RV64IM-NEXT: sh a1, 6(a0) +; RV64IM-NEXT: sh a2, 2(a0) +; RV64IM-NEXT: sh a6, 4(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -1019,26 +1401,51 @@ ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) -; RV32I-NEXT: lh a2, 4(a1) -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lh s2, 12(a1) -; RV32I-NEXT: lh a0, 8(a1) -; RV32I-NEXT: srli a1, a2, 17 -; RV32I-NEXT: add a1, a2, a1 -; RV32I-NEXT: lui a3, 8 -; RV32I-NEXT: and a1, a1, a3 -; RV32I-NEXT: sub s3, a2, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: lh a1, 4(a1) +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: slli a0, a1, 15 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 30 +; RV32I-NEXT: srli a2, a0, 15 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 15 +; RV32I-NEXT: add s3, a1, a0 +; RV32I-NEXT: lh s1, 8(s0) +; RV32I-NEXT: lui a0, 1048571 +; RV32I-NEXT: addi a1, a0, 535 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 20 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: addi a1, zero, 23 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a1, a0, 1327 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: sh zero, 0(s0) -; RV32I-NEXT: sh a0, 6(s0) -; RV32I-NEXT: sh s1, 4(s0) -; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s1, s1, a0 +; RV32I-NEXT: lh s0, 12(s0) +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a1, a0, 87 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srai a0, a0, 26 +; RV32I-NEXT: srli a1, a0, 15 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: addi a1, a1, 1327 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: sh zero, 0(s2) +; RV32I-NEXT: sh a0, 6(s2) +; RV32I-NEXT: sh s1, 4(s2) +; RV32I-NEXT: sh s3, 2(s2) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -1049,38 +1456,48 @@ ; ; RV32IM-LABEL: dont_fold_urem_i16_smax: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a2, 4(a1) -; RV32IM-NEXT: lh a3, 8(a1) -; RV32IM-NEXT: lh a1, 12(a1) -; RV32IM-NEXT: lui a4, 729444 -; RV32IM-NEXT: addi a4, a4, 713 -; RV32IM-NEXT: mulh a4, a3, a4 -; RV32IM-NEXT: add a4, a4, a3 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 4 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: addi a5, zero, 23 -; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: lh a2, 8(a1) +; RV32IM-NEXT: lui a3, 1048571 +; RV32IM-NEXT: addi a3, a3, 535 +; RV32IM-NEXT: mul a3, a2, a3 +; RV32IM-NEXT: srli a3, a3, 16 +; RV32IM-NEXT: add a3, a3, a2 +; RV32IM-NEXT: slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 20 +; RV32IM-NEXT: srli a4, a3, 15 +; RV32IM-NEXT: andi a4, a4, 1 +; RV32IM-NEXT: add a3, a3, a4 +; RV32IM-NEXT: addi a4, zero, 23 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: lh a4, 4(a1) +; RV32IM-NEXT: sub a2, a2, a3 +; RV32IM-NEXT: lui a3, 8 +; RV32IM-NEXT: addi a3, a3, -1 +; RV32IM-NEXT: mul a3, a4, a3 +; RV32IM-NEXT: srli a3, a3, 16 ; RV32IM-NEXT: sub a3, a3, a4 -; RV32IM-NEXT: lui a4, 395996 -; RV32IM-NEXT: addi a4, a4, -2009 -; RV32IM-NEXT: mulh a4, a1, a4 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srli a4, a4, 11 +; RV32IM-NEXT: 
slli a3, a3, 16 +; RV32IM-NEXT: srai a3, a3, 30 +; RV32IM-NEXT: srli a5, a3, 15 +; RV32IM-NEXT: add a3, a3, a5 +; RV32IM-NEXT: slli a3, a3, 15 +; RV32IM-NEXT: lh a1, 12(a1) +; RV32IM-NEXT: add a3, a4, a3 +; RV32IM-NEXT: lui a4, 3 +; RV32IM-NEXT: addi a4, a4, 87 +; RV32IM-NEXT: mul a4, a1, a4 +; RV32IM-NEXT: srai a4, a4, 26 +; RV32IM-NEXT: srli a5, a4, 15 +; RV32IM-NEXT: andi a5, a5, 1 ; RV32IM-NEXT: add a4, a4, a5 ; RV32IM-NEXT: lui a5, 1 ; RV32IM-NEXT: addi a5, a5, 1327 ; RV32IM-NEXT: mul a4, a4, a5 ; RV32IM-NEXT: sub a1, a1, a4 -; RV32IM-NEXT: srli a4, a2, 17 -; RV32IM-NEXT: add a4, a2, a4 -; RV32IM-NEXT: lui a5, 8 -; RV32IM-NEXT: and a4, a4, a5 -; RV32IM-NEXT: sub a2, a2, a4 ; RV32IM-NEXT: sh zero, 0(a0) -; RV32IM-NEXT: sh a2, 2(a0) ; RV32IM-NEXT: sh a1, 6(a0) -; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a2, 4(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_urem_i16_smax: @@ -1091,26 +1508,51 @@ ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) ; RV64I-NEXT: sd s3, 8(sp) -; RV64I-NEXT: lh a2, 8(a1) -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lh s2, 24(a1) -; RV64I-NEXT: lh a0, 16(a1) -; RV64I-NEXT: srli a1, a2, 49 -; RV64I-NEXT: add a1, a2, a1 -; RV64I-NEXT: lui a3, 8 -; RV64I-NEXT: and a1, a1, a3 -; RV64I-NEXT: sub s3, a2, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: lh a1, 8(a1) +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: slli a0, a1, 15 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 62 +; RV64I-NEXT: srli a2, a0, 15 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 15 +; RV64I-NEXT: add s3, a1, a0 +; RV64I-NEXT: lh s1, 16(s0) +; RV64I-NEXT: lui a0, 1048571 +; RV64I-NEXT: addiw a1, a0, 535 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: add a0, a0, s1 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 52 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: addi a1, zero, 23 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a1, a0, 1327 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: sh zero, 0(s0) -; RV64I-NEXT: sh a0, 6(s0) -; RV64I-NEXT: sh s1, 4(s0) -; RV64I-NEXT: sh s3, 2(s0) +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s1, s1, a0 +; RV64I-NEXT: lh s0, 24(s0) +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a1, a0, 87 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srai a0, a0, 26 +; RV64I-NEXT: srli a1, a0, 15 +; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: addiw a1, a1, 1327 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: sh zero, 0(s2) +; RV64I-NEXT: sh a0, 6(s2) +; RV64I-NEXT: sh s1, 4(s2) +; RV64I-NEXT: sh s3, 2(s2) ; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) @@ -1122,49 +1564,46 @@ ; RV64IM-LABEL: dont_fold_urem_i16_smax: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lh a2, 8(a1) -; RV64IM-NEXT: lh a3, 24(a1) -; RV64IM-NEXT: lh a1, 16(a1) -; RV64IM-NEXT: lui a4, 1043590 -; RV64IM-NEXT: addiw a4, a4, -1781 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 1069 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, -1959 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 357 -; RV64IM-NEXT: mulh a4, a1, a4 -; RV64IM-NEXT: add a4, a4, a1 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 4 -; 
RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: slli a3, a2, 15 +; RV64IM-NEXT: sub a3, a3, a2 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: sub a3, a3, a2 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 62 +; RV64IM-NEXT: srli a4, a3, 15 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: slli a3, a3, 15 +; RV64IM-NEXT: lh a4, 16(a1) +; RV64IM-NEXT: add a2, a2, a3 +; RV64IM-NEXT: lui a3, 1048571 +; RV64IM-NEXT: addiw a3, a3, 535 +; RV64IM-NEXT: mul a3, a4, a3 +; RV64IM-NEXT: srli a3, a3, 16 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: slli a3, a3, 48 +; RV64IM-NEXT: srai a3, a3, 52 +; RV64IM-NEXT: srli a5, a3, 15 +; RV64IM-NEXT: andi a5, a5, 1 +; RV64IM-NEXT: add a3, a3, a5 ; RV64IM-NEXT: addi a5, zero, 23 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a1, a1, a4 -; RV64IM-NEXT: lui a4, 12375 -; RV64IM-NEXT: addiw a4, a4, -575 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 883 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, -431 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 1959 -; RV64IM-NEXT: mulh a4, a3, a4 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 11 +; RV64IM-NEXT: mul a3, a3, a5 +; RV64IM-NEXT: lh a1, 24(a1) +; RV64IM-NEXT: sub a3, a4, a3 +; RV64IM-NEXT: lui a4, 3 +; RV64IM-NEXT: addiw a4, a4, 87 +; RV64IM-NEXT: mul a4, a1, a4 +; RV64IM-NEXT: srai a4, a4, 26 +; RV64IM-NEXT: srli a5, a4, 15 +; RV64IM-NEXT: andi a5, a5, 1 ; RV64IM-NEXT: add a4, a4, a5 ; RV64IM-NEXT: lui a5, 1 ; RV64IM-NEXT: addiw a5, a5, 1327 ; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a3, a3, a4 -; RV64IM-NEXT: srli a4, a2, 49 -; RV64IM-NEXT: add a4, a2, a4 -; RV64IM-NEXT: lui a5, 8 -; RV64IM-NEXT: and a4, a4, a5 -; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: sub a1, a1, a4 ; RV64IM-NEXT: sh zero, 0(a0) +; RV64IM-NEXT: sh a1, 6(a0) +; RV64IM-NEXT: sh a3, 4(a0) ; RV64IM-NEXT: sh a2, 2(a0) -; RV64IM-NEXT: sh a3, 6(a0) -; RV64IM-NEXT: sh a1, 4(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -1174,180 +1613,457 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32I-LABEL: dont_fold_srem_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -48 -; RV32I-NEXT: sw ra, 44(sp) -; RV32I-NEXT: sw s0, 40(sp) -; RV32I-NEXT: sw s1, 36(sp) -; RV32I-NEXT: sw s2, 32(sp) -; RV32I-NEXT: sw s3, 28(sp) -; RV32I-NEXT: sw s4, 24(sp) -; RV32I-NEXT: sw s5, 20(sp) -; RV32I-NEXT: sw s6, 16(sp) -; RV32I-NEXT: sw s7, 12(sp) -; RV32I-NEXT: sw s8, 8(sp) -; RV32I-NEXT: sw s9, 4(sp) -; RV32I-NEXT: lw s2, 24(a1) -; RV32I-NEXT: lw s3, 28(a1) -; RV32I-NEXT: lw s4, 16(a1) -; RV32I-NEXT: lw s5, 20(a1) +; RV32I-NEXT: addi sp, sp, -240 +; RV32I-NEXT: sw ra, 236(sp) +; RV32I-NEXT: sw s0, 232(sp) +; RV32I-NEXT: sw s1, 228(sp) +; RV32I-NEXT: sw s2, 224(sp) +; RV32I-NEXT: sw s3, 220(sp) +; RV32I-NEXT: sw s4, 216(sp) +; RV32I-NEXT: sw s5, 212(sp) +; RV32I-NEXT: sw s6, 208(sp) +; RV32I-NEXT: sw s7, 204(sp) +; RV32I-NEXT: sw s8, 200(sp) +; RV32I-NEXT: sw s9, 196(sp) +; RV32I-NEXT: sw s10, 192(sp) +; RV32I-NEXT: lw s3, 0(a1) +; RV32I-NEXT: lw s5, 4(a1) +; RV32I-NEXT: lw s4, 24(a1) +; RV32I-NEXT: lw s7, 28(a1) ; RV32I-NEXT: lw s6, 8(a1) -; RV32I-NEXT: lw s1, 12(a1) -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: lw s8, 12(a1) +; RV32I-NEXT: lw s2, 16(a1) +; RV32I-NEXT: lw s1, 20(a1) ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a2, zero, 1 -; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: addi a0, zero, -1 +; RV32I-NEXT: sw a0, 108(sp) +; RV32I-NEXT: sw a0, 104(sp) +; RV32I-NEXT: lui a0, 729444 +; RV32I-NEXT: addi a0, a0, 712 +; RV32I-NEXT: sw 
a0, 100(sp) +; RV32I-NEXT: lui a0, 364722 +; RV32I-NEXT: addi a0, a0, 357 +; RV32I-NEXT: sw a0, 96(sp) +; RV32I-NEXT: sw s1, 116(sp) +; RV32I-NEXT: sw s2, 112(sp) +; RV32I-NEXT: srai a3, s1, 31 +; RV32I-NEXT: sw a3, 124(sp) +; RV32I-NEXT: addi a0, sp, 128 +; RV32I-NEXT: addi a1, sp, 112 +; RV32I-NEXT: addi a2, sp, 96 +; RV32I-NEXT: sw a3, 120(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: sw zero, 60(sp) +; RV32I-NEXT: sw zero, 56(sp) +; RV32I-NEXT: lui a0, 410452 +; RV32I-NEXT: addi a0, a0, -952 +; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: lui a0, 25653 +; RV32I-NEXT: addi a0, a0, 965 +; RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw s8, 68(sp) +; RV32I-NEXT: sw s6, 64(sp) +; RV32I-NEXT: srai a3, s8, 31 +; RV32I-NEXT: sw a3, 76(sp) +; RV32I-NEXT: addi a0, sp, 80 +; RV32I-NEXT: addi a1, sp, 64 +; RV32I-NEXT: addi a2, sp, 48 +; RV32I-NEXT: sw a3, 72(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: sw zero, 156(sp) +; RV32I-NEXT: sw zero, 152(sp) +; RV32I-NEXT: lui a0, 395996 +; RV32I-NEXT: addi a0, a0, -2010 +; RV32I-NEXT: sw a0, 148(sp) +; RV32I-NEXT: lui a0, 941649 +; RV32I-NEXT: addi a0, a0, 1959 +; RV32I-NEXT: sw a0, 144(sp) +; RV32I-NEXT: sw s7, 164(sp) +; RV32I-NEXT: sw s4, 160(sp) +; RV32I-NEXT: srai a3, s7, 31 +; RV32I-NEXT: sw a3, 172(sp) +; RV32I-NEXT: addi a0, sp, 176 +; RV32I-NEXT: addi a1, sp, 160 +; RV32I-NEXT: addi a2, sp, 144 +; RV32I-NEXT: sw a3, 168(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 8(sp) +; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw s5, 20(sp) +; RV32I-NEXT: sw s3, 16(sp) +; RV32I-NEXT: srai a3, s5, 31 +; RV32I-NEXT: sw a3, 28(sp) +; RV32I-NEXT: addi a0, sp, 32 +; RV32I-NEXT: addi a1, sp, 16 +; RV32I-NEXT: mv a2, sp +; RV32I-NEXT: sw a3, 24(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: lw a0, 140(sp) +; RV32I-NEXT: lw a1, 136(sp) +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: add a2, a1, s2 +; RV32I-NEXT: sltu a1, a2, a1 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: slli a0, a1, 28 +; RV32I-NEXT: srli a2, a2, 4 +; RV32I-NEXT: or a2, a2, a0 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: srai a1, a1, 4 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: addi a2, zero, 23 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __moddi3 -; RV32I-NEXT: mv s7, a0 -; RV32I-NEXT: mv s8, a1 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: sltu a2, s2, a0 +; RV32I-NEXT: sub a1, s1, a1 +; RV32I-NEXT: lw a3, 92(sp) +; RV32I-NEXT: lw a4, 88(sp) +; RV32I-NEXT: sub s9, a1, a2 +; RV32I-NEXT: sub s10, s2, a0 +; RV32I-NEXT: slli a0, a3, 24 +; RV32I-NEXT: srli a1, a4, 8 +; RV32I-NEXT: or a1, a1, a0 +; RV32I-NEXT: srli a0, a3, 31 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: sltu a1, a0, a1 +; RV32I-NEXT: srai a2, a3, 8 +; RV32I-NEXT: add a1, a2, a1 ; RV32I-NEXT: addi a2, zero, 654 -; RV32I-NEXT: mv a0, s6 -; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __moddi3 -; RV32I-NEXT: mv s6, a0 -; RV32I-NEXT: mv s9, a1 -; RV32I-NEXT: addi a2, zero, 23 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s5 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: sub a0, s8, a1 +; RV32I-NEXT: lw a1, 188(sp) +; RV32I-NEXT: lw a2, 184(sp) +; RV32I-NEXT: sltu a3, s6, s2 +; RV32I-NEXT: sub s8, a0, a3 +; RV32I-NEXT: slli a0, a1, 21 +; RV32I-NEXT: srli a2, a2, 11 +; RV32I-NEXT: or a2, a2, a0 +; RV32I-NEXT: srli a0, a1, 31 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: sltu a2, a0, a2 +; RV32I-NEXT: srai a1, a1, 11 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: lui 
a2, 1 +; RV32I-NEXT: addi a2, a2, 1327 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __moddi3 -; RV32I-NEXT: mv s4, a0 -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a2, a0, 1327 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: sltu a0, s4, a0 +; RV32I-NEXT: lw a2, 44(sp) +; RV32I-NEXT: lw a3, 40(sp) +; RV32I-NEXT: sub a1, s7, a1 +; RV32I-NEXT: sub s7, a1, a0 +; RV32I-NEXT: add a1, a2, s5 +; RV32I-NEXT: add a0, a3, s3 +; RV32I-NEXT: sltu a2, a0, a3 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: addi a2, zero, 1 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __moddi3 -; RV32I-NEXT: sw a1, 28(s0) -; RV32I-NEXT: sw a0, 24(s0) -; RV32I-NEXT: sw s1, 20(s0) -; RV32I-NEXT: sw s4, 16(s0) -; RV32I-NEXT: sw s9, 12(s0) -; RV32I-NEXT: sw s6, 8(s0) -; RV32I-NEXT: sw s8, 4(s0) -; RV32I-NEXT: sw s7, 0(s0) -; RV32I-NEXT: lw s9, 4(sp) -; RV32I-NEXT: lw s8, 8(sp) -; RV32I-NEXT: lw s7, 12(sp) -; RV32I-NEXT: lw s6, 16(sp) -; RV32I-NEXT: lw s5, 20(sp) -; RV32I-NEXT: lw s4, 24(sp) -; RV32I-NEXT: lw s3, 28(sp) -; RV32I-NEXT: lw s2, 32(sp) -; RV32I-NEXT: lw s1, 36(sp) -; RV32I-NEXT: lw s0, 40(sp) -; RV32I-NEXT: lw ra, 44(sp) -; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: sltu a2, s3, a0 +; RV32I-NEXT: sub a1, s5, a1 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a2, s6, s2 +; RV32I-NEXT: sub a3, s4, s1 +; RV32I-NEXT: sub a0, s3, a0 +; RV32I-NEXT: sw a0, 0(s0) +; RV32I-NEXT: sw a1, 4(s0) +; RV32I-NEXT: sw a3, 24(s0) +; RV32I-NEXT: sw a2, 8(s0) +; RV32I-NEXT: sw s7, 28(s0) +; RV32I-NEXT: sw s8, 12(s0) +; RV32I-NEXT: sw s10, 16(s0) +; RV32I-NEXT: sw s9, 20(s0) +; RV32I-NEXT: lw s10, 192(sp) +; RV32I-NEXT: lw s9, 196(sp) +; RV32I-NEXT: lw s8, 200(sp) +; RV32I-NEXT: lw s7, 204(sp) +; RV32I-NEXT: lw s6, 208(sp) +; RV32I-NEXT: lw s5, 212(sp) +; RV32I-NEXT: lw s4, 216(sp) +; RV32I-NEXT: lw s3, 220(sp) +; RV32I-NEXT: lw s2, 224(sp) +; RV32I-NEXT: lw s1, 228(sp) +; RV32I-NEXT: lw s0, 232(sp) +; RV32I-NEXT: lw ra, 236(sp) +; RV32I-NEXT: addi sp, sp, 240 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: dont_fold_srem_i64: ; RV32IM: # %bb.0: -; RV32IM-NEXT: addi sp, sp, -48 -; RV32IM-NEXT: sw ra, 44(sp) -; RV32IM-NEXT: sw s0, 40(sp) -; RV32IM-NEXT: sw s1, 36(sp) -; RV32IM-NEXT: sw s2, 32(sp) -; RV32IM-NEXT: sw s3, 28(sp) -; RV32IM-NEXT: sw s4, 24(sp) -; RV32IM-NEXT: sw s5, 20(sp) -; RV32IM-NEXT: sw s6, 16(sp) -; RV32IM-NEXT: sw s7, 12(sp) -; RV32IM-NEXT: sw s8, 8(sp) -; RV32IM-NEXT: sw s9, 4(sp) +; RV32IM-NEXT: addi sp, sp, -240 +; RV32IM-NEXT: sw ra, 236(sp) +; RV32IM-NEXT: sw s0, 232(sp) +; RV32IM-NEXT: sw s1, 228(sp) +; RV32IM-NEXT: sw s2, 224(sp) +; RV32IM-NEXT: sw s3, 220(sp) +; RV32IM-NEXT: sw s4, 216(sp) +; RV32IM-NEXT: sw s5, 212(sp) +; RV32IM-NEXT: sw s6, 208(sp) +; RV32IM-NEXT: sw s7, 204(sp) +; RV32IM-NEXT: sw s8, 200(sp) +; RV32IM-NEXT: lw s3, 0(a1) +; RV32IM-NEXT: lw s7, 4(a1) ; RV32IM-NEXT: lw s2, 24(a1) -; RV32IM-NEXT: lw s3, 28(a1) -; RV32IM-NEXT: lw s4, 16(a1) -; RV32IM-NEXT: lw s5, 20(a1) -; RV32IM-NEXT: lw s6, 8(a1) -; RV32IM-NEXT: lw s1, 12(a1) -; RV32IM-NEXT: lw a3, 0(a1) -; RV32IM-NEXT: lw a1, 4(a1) +; RV32IM-NEXT: lw s5, 28(a1) +; RV32IM-NEXT: lw s4, 8(a1) +; RV32IM-NEXT: lw s6, 12(a1) +; RV32IM-NEXT: lw s8, 16(a1) +; RV32IM-NEXT: lw s1, 20(a1) ; RV32IM-NEXT: mv s0, a0 -; RV32IM-NEXT: addi a2, zero, 1 -; RV32IM-NEXT: mv a0, a3 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __moddi3 -; RV32IM-NEXT: mv s7, a0 -; RV32IM-NEXT: mv s8, a1 -; RV32IM-NEXT: addi a2, zero, 654 -; RV32IM-NEXT: mv a0, s6 -; 
RV32IM-NEXT: mv a1, s1 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __moddi3 -; RV32IM-NEXT: mv s6, a0 -; RV32IM-NEXT: mv s9, a1 -; RV32IM-NEXT: addi a2, zero, 23 -; RV32IM-NEXT: mv a0, s4 -; RV32IM-NEXT: mv a1, s5 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __moddi3 -; RV32IM-NEXT: mv s4, a0 -; RV32IM-NEXT: mv s1, a1 -; RV32IM-NEXT: lui a0, 1 -; RV32IM-NEXT: addi a2, a0, 1327 -; RV32IM-NEXT: mv a0, s2 -; RV32IM-NEXT: mv a1, s3 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __moddi3 -; RV32IM-NEXT: sw a1, 28(s0) -; RV32IM-NEXT: sw a0, 24(s0) -; RV32IM-NEXT: sw s1, 20(s0) -; RV32IM-NEXT: sw s4, 16(s0) -; RV32IM-NEXT: sw s9, 12(s0) -; RV32IM-NEXT: sw s6, 8(s0) -; RV32IM-NEXT: sw s8, 4(s0) -; RV32IM-NEXT: sw s7, 0(s0) -; RV32IM-NEXT: lw s9, 4(sp) -; RV32IM-NEXT: lw s8, 8(sp) -; RV32IM-NEXT: lw s7, 12(sp) -; RV32IM-NEXT: lw s6, 16(sp) -; RV32IM-NEXT: lw s5, 20(sp) -; RV32IM-NEXT: lw s4, 24(sp) -; RV32IM-NEXT: lw s3, 28(sp) -; RV32IM-NEXT: lw s2, 32(sp) -; RV32IM-NEXT: lw s1, 36(sp) -; RV32IM-NEXT: lw s0, 40(sp) -; RV32IM-NEXT: lw ra, 44(sp) -; RV32IM-NEXT: addi sp, sp, 48 +; RV32IM-NEXT: addi a0, zero, -1 +; RV32IM-NEXT: sw a0, 116(sp) +; RV32IM-NEXT: sw a0, 112(sp) +; RV32IM-NEXT: lui a0, 729444 +; RV32IM-NEXT: addi a0, a0, 712 +; RV32IM-NEXT: sw a0, 108(sp) +; RV32IM-NEXT: lui a0, 364722 +; RV32IM-NEXT: addi a0, a0, 357 +; RV32IM-NEXT: sw a0, 104(sp) +; RV32IM-NEXT: sw s1, 124(sp) +; RV32IM-NEXT: sw s8, 120(sp) +; RV32IM-NEXT: srai a3, s1, 31 +; RV32IM-NEXT: sw a3, 132(sp) +; RV32IM-NEXT: addi a0, sp, 136 +; RV32IM-NEXT: addi a1, sp, 120 +; RV32IM-NEXT: addi a2, sp, 104 +; RV32IM-NEXT: sw a3, 128(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 68(sp) +; RV32IM-NEXT: sw zero, 64(sp) +; RV32IM-NEXT: lui a0, 410452 +; RV32IM-NEXT: addi a0, a0, -952 +; RV32IM-NEXT: sw a0, 60(sp) +; RV32IM-NEXT: lui a0, 25653 +; RV32IM-NEXT: addi a0, a0, 965 +; RV32IM-NEXT: sw a0, 56(sp) +; RV32IM-NEXT: sw s6, 76(sp) +; RV32IM-NEXT: sw s4, 72(sp) +; RV32IM-NEXT: srai a3, s6, 31 +; RV32IM-NEXT: sw a3, 84(sp) +; RV32IM-NEXT: addi a0, sp, 88 +; RV32IM-NEXT: addi a1, sp, 72 +; RV32IM-NEXT: addi a2, sp, 56 +; RV32IM-NEXT: sw a3, 80(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 164(sp) +; RV32IM-NEXT: sw zero, 160(sp) +; RV32IM-NEXT: lui a0, 395996 +; RV32IM-NEXT: addi a0, a0, -2010 +; RV32IM-NEXT: sw a0, 156(sp) +; RV32IM-NEXT: lui a0, 941649 +; RV32IM-NEXT: addi a0, a0, 1959 +; RV32IM-NEXT: sw a0, 152(sp) +; RV32IM-NEXT: sw s5, 172(sp) +; RV32IM-NEXT: sw s2, 168(sp) +; RV32IM-NEXT: srai a3, s5, 31 +; RV32IM-NEXT: sw a3, 180(sp) +; RV32IM-NEXT: addi a0, sp, 184 +; RV32IM-NEXT: addi a1, sp, 168 +; RV32IM-NEXT: addi a2, sp, 152 +; RV32IM-NEXT: sw a3, 176(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 20(sp) +; RV32IM-NEXT: sw zero, 16(sp) +; RV32IM-NEXT: sw zero, 12(sp) +; RV32IM-NEXT: sw zero, 8(sp) +; RV32IM-NEXT: sw s7, 28(sp) +; RV32IM-NEXT: sw s3, 24(sp) +; RV32IM-NEXT: srai a3, s7, 31 +; RV32IM-NEXT: sw a3, 36(sp) +; RV32IM-NEXT: addi a0, sp, 40 +; RV32IM-NEXT: addi a1, sp, 24 +; RV32IM-NEXT: addi a2, sp, 8 +; RV32IM-NEXT: sw a3, 32(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: lw a0, 148(sp) +; RV32IM-NEXT: lw a1, 144(sp) +; RV32IM-NEXT: add a0, a0, s1 +; RV32IM-NEXT: add a2, a1, s8 +; RV32IM-NEXT: sltu a1, a2, a1 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: slli a1, a0, 28 +; RV32IM-NEXT: srli a2, a2, 4 +; RV32IM-NEXT: or a1, a2, a1 +; RV32IM-NEXT: srli a2, a0, 31 +; RV32IM-NEXT: add a2, a1, a2 +; RV32IM-NEXT: addi a3, zero, 23 +; RV32IM-NEXT: mulhu a4, a2, a3 
+; RV32IM-NEXT: sltu a1, a2, a1 +; RV32IM-NEXT: srai a0, a0, 4 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: mul a0, a0, a3 +; RV32IM-NEXT: add a0, a4, a0 +; RV32IM-NEXT: sub a0, s1, a0 +; RV32IM-NEXT: mul a1, a2, a3 +; RV32IM-NEXT: sltu a2, s8, a1 +; RV32IM-NEXT: lw a3, 100(sp) +; RV32IM-NEXT: lw a4, 96(sp) +; RV32IM-NEXT: sub a6, a0, a2 +; RV32IM-NEXT: sub a7, s8, a1 +; RV32IM-NEXT: slli a2, a3, 24 +; RV32IM-NEXT: srli a4, a4, 8 +; RV32IM-NEXT: or a2, a4, a2 +; RV32IM-NEXT: srli a4, a3, 31 +; RV32IM-NEXT: add a4, a2, a4 +; RV32IM-NEXT: sltu a2, a4, a2 +; RV32IM-NEXT: srai a3, a3, 8 +; RV32IM-NEXT: add a2, a3, a2 +; RV32IM-NEXT: addi a3, zero, 654 +; RV32IM-NEXT: mul a2, a2, a3 +; RV32IM-NEXT: mulhu a5, a4, a3 +; RV32IM-NEXT: add a2, a5, a2 +; RV32IM-NEXT: sub a2, s6, a2 +; RV32IM-NEXT: mul a3, a4, a3 +; RV32IM-NEXT: lw a4, 196(sp) +; RV32IM-NEXT: lw a5, 192(sp) +; RV32IM-NEXT: sltu s1, s4, a3 +; RV32IM-NEXT: sub a2, a2, s1 +; RV32IM-NEXT: slli s1, a4, 21 +; RV32IM-NEXT: srli a5, a5, 11 +; RV32IM-NEXT: or a5, a5, s1 +; RV32IM-NEXT: srli s1, a4, 31 +; RV32IM-NEXT: add s1, a5, s1 +; RV32IM-NEXT: sltu a5, s1, a5 +; RV32IM-NEXT: srai a4, a4, 11 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: lui a5, 1 +; RV32IM-NEXT: addi a5, a5, 1327 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: mulhu a0, s1, a5 +; RV32IM-NEXT: add a0, a0, a4 +; RV32IM-NEXT: sub a0, s5, a0 +; RV32IM-NEXT: mul a4, s1, a5 +; RV32IM-NEXT: lw a5, 52(sp) +; RV32IM-NEXT: lw s1, 48(sp) +; RV32IM-NEXT: sltu a1, s2, a4 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: add a1, a5, s7 +; RV32IM-NEXT: add a5, s1, s3 +; RV32IM-NEXT: sltu s1, a5, s1 +; RV32IM-NEXT: add a1, a1, s1 +; RV32IM-NEXT: sub a1, s7, a1 +; RV32IM-NEXT: sltu s1, s3, a5 +; RV32IM-NEXT: sub a1, a1, s1 +; RV32IM-NEXT: sub a3, s4, a3 +; RV32IM-NEXT: sub a4, s2, a4 +; RV32IM-NEXT: sub a5, s3, a5 +; RV32IM-NEXT: sw a5, 0(s0) +; RV32IM-NEXT: sw a4, 24(s0) +; RV32IM-NEXT: sw a3, 8(s0) +; RV32IM-NEXT: sw a1, 4(s0) +; RV32IM-NEXT: sw a7, 16(s0) +; RV32IM-NEXT: sw a0, 28(s0) +; RV32IM-NEXT: sw a2, 12(s0) +; RV32IM-NEXT: sw a6, 20(s0) +; RV32IM-NEXT: lw s8, 200(sp) +; RV32IM-NEXT: lw s7, 204(sp) +; RV32IM-NEXT: lw s6, 208(sp) +; RV32IM-NEXT: lw s5, 212(sp) +; RV32IM-NEXT: lw s4, 216(sp) +; RV32IM-NEXT: lw s3, 220(sp) +; RV32IM-NEXT: lw s2, 224(sp) +; RV32IM-NEXT: lw s1, 228(sp) +; RV32IM-NEXT: lw s0, 232(sp) +; RV32IM-NEXT: lw ra, 236(sp) +; RV32IM-NEXT: addi sp, sp, 240 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_srem_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: sd ra, 56(sp) +; RV64I-NEXT: sd s0, 48(sp) +; RV64I-NEXT: sd s1, 40(sp) +; RV64I-NEXT: sd s2, 32(sp) +; RV64I-NEXT: sd s3, 24(sp) +; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: sd s5, 8(sp) +; RV64I-NEXT: ld s4, 0(a1) ; RV64I-NEXT: ld s2, 24(a1) +; RV64I-NEXT: ld s0, 8(a1) ; RV64I-NEXT: ld s1, 16(a1) -; RV64I-NEXT: ld a2, 8(a1) -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, zero, 654 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __moddi3 ; RV64I-NEXT: mv s3, a0 -; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: srai a1, s1, 63 +; RV64I-NEXT: lui a0, 1043590 +; RV64I-NEXT: addiw a0, a0, -1781 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a0, a0, 1069 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: addi a0, a0, -1959 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a2, a0, 357 +; RV64I-NEXT: addi a3, zero, -1 ; RV64I-NEXT: 
mv a0, s1 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a1, a0, 1327 +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: add a0, a1, s1 +; RV64I-NEXT: srli a1, a0, 63 +; RV64I-NEXT: srai a0, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s5, s1, a0 +; RV64I-NEXT: srai a1, s0, 63 +; RV64I-NEXT: lui a0, 6413 +; RV64I-NEXT: addiw a0, a0, 1265 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a0, a0, 1027 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a0, a0, 1077 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: addi a2, a0, 965 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: srli a0, a1, 63 +; RV64I-NEXT: srai a1, a1, 8 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: addi a1, zero, 654 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s0, s0, a0 +; RV64I-NEXT: srai a1, s2, 63 +; RV64I-NEXT: lui a0, 12375 +; RV64I-NEXT: addiw a0, a0, -575 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: addi a0, a0, 883 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a0, a0, -431 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: addi a2, a0, 1959 ; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: sd zero, 0(s0) -; RV64I-NEXT: sd a0, 24(s0) -; RV64I-NEXT: sd s1, 16(s0) -; RV64I-NEXT: sd s3, 8(s0) -; RV64I-NEXT: ld s3, 8(sp) -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: srli a0, a1, 63 +; RV64I-NEXT: srai a1, a1, 11 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: addiw a1, a1, 1327 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s1, s2, a0 +; RV64I-NEXT: srai a1, s4, 63 +; RV64I-NEXT: mv a0, s4 +; RV64I-NEXT: mv a2, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: add a0, a1, s4 +; RV64I-NEXT: sub a0, s4, a0 +; RV64I-NEXT: sd a0, 0(s3) +; RV64I-NEXT: sd s1, 24(s3) +; RV64I-NEXT: sd s0, 8(s3) +; RV64I-NEXT: sd s5, 16(s3) +; RV64I-NEXT: ld s5, 8(sp) +; RV64I-NEXT: ld s4, 16(sp) +; RV64I-NEXT: ld s3, 24(sp) +; RV64I-NEXT: ld s2, 32(sp) +; RV64I-NEXT: ld s1, 40(sp) +; RV64I-NEXT: ld s0, 48(sp) +; RV64I-NEXT: ld ra, 56(sp) +; RV64I-NEXT: addi sp, sp, 64 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: dont_fold_srem_i64: diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll --- a/llvm/test/CodeGen/RISCV/urem-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll @@ -13,8 +13,22 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lui a0, 364242 +; RV32I-NEXT: addi a2, a0, 777 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a1, zero +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: sub a0, s0, a1 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 6 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -37,34 +51,41 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: lui a1, 364242 +; RV64I-NEXT: addiw a1, a1, 777 +; RV64I-NEXT: call __muldi3 +; 
RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: sub a1, s0, a0 +; RV64I-NEXT: srliw a1, a1, 1 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: srli a0, a0, 6 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: subw a0, s0, a0 +; RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: fold_urem_positive_odd: ; RV64IM: # %bb.0: -; RV64IM-NEXT: slli a0, a0, 32 -; RV64IM-NEXT: srli a0, a0, 32 -; RV64IM-NEXT: lui a1, 1423 -; RV64IM-NEXT: addiw a1, a1, -733 -; RV64IM-NEXT: slli a1, a1, 15 -; RV64IM-NEXT: addi a1, a1, 1035 -; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, -1811 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 561 -; RV64IM-NEXT: mulhu a1, a0, a1 +; RV64IM-NEXT: slli a1, a0, 32 +; RV64IM-NEXT: srli a1, a1, 32 +; RV64IM-NEXT: lui a2, 364242 +; RV64IM-NEXT: addiw a2, a2, 777 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 32 ; RV64IM-NEXT: sub a2, a0, a1 -; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: srliw a2, a2, 1 ; RV64IM-NEXT: add a1, a2, a1 ; RV64IM-NEXT: srli a1, a1, 6 ; RV64IM-NEXT: addi a2, zero, 95 ; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: subw a0, a0, a1 ; RV64IM-NEXT: ret %1 = urem i32 %x, 95 ret i32 %1 @@ -76,8 +97,19 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lui a0, 1012964 +; RV32I-NEXT: addi a2, a0, -61 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a1, zero +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: srli a0, a1, 10 ; RV32I-NEXT: addi a1, zero, 1060 -; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret @@ -97,31 +129,37 @@ ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: lui a1, 62 +; RV64I-NEXT: addiw a1, a1, -711 +; RV64I-NEXT: slli a1, a1, 14 +; RV64I-NEXT: addi a1, a1, -61 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 42 ; RV64I-NEXT: addi a1, zero, 1060 -; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: subw a0, s0, a0 +; RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: fold_urem_positive_even: ; RV64IM: # %bb.0: -; RV64IM-NEXT: slli a0, a0, 32 -; RV64IM-NEXT: srli a0, a0, 32 -; RV64IM-NEXT: lui a1, 1048020 -; RV64IM-NEXT: addiw a1, a1, -1793 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 139 -; RV64IM-NEXT: slli a1, a1, 14 -; RV64IM-NEXT: addi a1, a1, 1793 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -139 -; RV64IM-NEXT: mulhu a1, a0, a1 -; RV64IM-NEXT: srli a1, a1, 10 +; RV64IM-NEXT: slli a1, a0, 32 +; RV64IM-NEXT: srli a1, a1, 32 +; RV64IM-NEXT: lui a2, 62 +; RV64IM-NEXT: addiw a2, a2, -711 +; RV64IM-NEXT: slli a2, a2, 14 +; RV64IM-NEXT: addi a2, a2, -61 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 42 ; RV64IM-NEXT: addi a2, zero, 1060 ; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: subw a0, a0, a1 ; RV64IM-NEXT: ret %1 = urem i32 %x, 1060 ret i32 %1 @@ -137,13 +175,21 @@ ; RV32I-NEXT: sw s0, 8(sp) ; RV32I-NEXT: sw s1, 4(sp) ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: 
call __umodsi3 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: lui a0, 364242 +; RV32I-NEXT: addi a2, a0, 777 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __udivsi3 -; RV32I-NEXT: add a0, s1, a0 +; RV32I-NEXT: mv a1, zero +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: sub a0, s0, a1 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli s1, a0, 6 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: add a0, a0, s1 ; RV32I-NEXT: lw s1, 4(sp) ; RV32I-NEXT: lw s0, 8(sp) ; RV32I-NEXT: lw ra, 12(sp) @@ -171,16 +217,22 @@ ; RV64I-NEXT: sd ra, 24(sp) ; RV64I-NEXT: sd s0, 16(sp) ; RV64I-NEXT: sd s1, 8(sp) +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: srli s0, a0, 32 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: lui a1, 364242 +; RV64I-NEXT: addiw a1, a1, 777 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: sub a1, s0, a0 +; RV64I-NEXT: srliw a1, a1, 1 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: srli s1, a0, 6 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __udivdi3 -; RV64I-NEXT: add a0, s1, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: addw a0, a0, s1 ; RV64I-NEXT: ld s1, 8(sp) ; RV64I-NEXT: ld s0, 16(sp) ; RV64I-NEXT: ld ra, 24(sp) @@ -189,25 +241,20 @@ ; ; RV64IM-LABEL: combine_urem_udiv: ; RV64IM: # %bb.0: -; RV64IM-NEXT: slli a0, a0, 32 -; RV64IM-NEXT: srli a0, a0, 32 -; RV64IM-NEXT: lui a1, 1423 -; RV64IM-NEXT: addiw a1, a1, -733 -; RV64IM-NEXT: slli a1, a1, 15 -; RV64IM-NEXT: addi a1, a1, 1035 -; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, -1811 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 561 -; RV64IM-NEXT: mulhu a1, a0, a1 +; RV64IM-NEXT: slli a1, a0, 32 +; RV64IM-NEXT: srli a1, a1, 32 +; RV64IM-NEXT: lui a2, 364242 +; RV64IM-NEXT: addiw a2, a2, 777 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 32 ; RV64IM-NEXT: sub a2, a0, a1 -; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: srliw a2, a2, 1 ; RV64IM-NEXT: add a1, a2, a1 ; RV64IM-NEXT: srli a1, a1, 6 ; RV64IM-NEXT: addi a2, zero, 95 ; RV64IM-NEXT: mul a2, a1, a2 ; RV64IM-NEXT: sub a0, a0, a2 -; RV64IM-NEXT: add a0, a0, a1 +; RV64IM-NEXT: addw a0, a0, a1 ; RV64IM-NEXT: ret %1 = urem i32 %x, 95 %2 = udiv i32 %x, 95 @@ -248,32 +295,123 @@ define i64 @dont_fold_urem_i64(i64 %x) nounwind { ; RV32I-LABEL: dont_fold_urem_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, -64 +; RV32I-NEXT: sw ra, 60(sp) +; RV32I-NEXT: sw s0, 56(sp) +; RV32I-NEXT: sw s1, 52(sp) +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 8(sp) +; RV32I-NEXT: sw zero, 28(sp) +; RV32I-NEXT: sw zero, 24(sp) +; RV32I-NEXT: lui a0, 342392 +; RV32I-NEXT: addi a0, a0, 668 +; RV32I-NEXT: sw a0, 4(sp) +; RV32I-NEXT: lui a0, 770382 +; RV32I-NEXT: addi a0, a0, 1505 +; RV32I-NEXT: sw a0, 0(sp) +; RV32I-NEXT: srli a0, a1, 1 +; RV32I-NEXT: sw a0, 20(sp) +; RV32I-NEXT: slli a0, a1, 31 +; RV32I-NEXT: srli a1, s1, 1 +; RV32I-NEXT: or a3, a1, a0 +; RV32I-NEXT: addi a0, sp, 32 +; RV32I-NEXT: addi a1, sp, 16 +; RV32I-NEXT: mv a2, sp +; RV32I-NEXT: sw a3, 16(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: lw a1, 44(sp) +; RV32I-NEXT: lw a0, 
40(sp) +; RV32I-NEXT: slli a2, a1, 28 +; RV32I-NEXT: srli a0, a0, 4 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a1, a1, 4 ; RV32I-NEXT: addi a2, zero, 98 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __umoddi3 -; RV32I-NEXT: lw ra, 12(sp) -; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: sltu a2, s1, a0 +; RV32I-NEXT: sub a1, s0, a1 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a0, s1, a0 +; RV32I-NEXT: lw s1, 52(sp) +; RV32I-NEXT: lw s0, 56(sp) +; RV32I-NEXT: lw ra, 60(sp) +; RV32I-NEXT: addi sp, sp, 64 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: dont_fold_urem_i64: ; RV32IM: # %bb.0: -; RV32IM-NEXT: addi sp, sp, -16 -; RV32IM-NEXT: sw ra, 12(sp) -; RV32IM-NEXT: addi a2, zero, 98 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: lw ra, 12(sp) -; RV32IM-NEXT: addi sp, sp, 16 +; RV32IM-NEXT: addi sp, sp, -64 +; RV32IM-NEXT: sw ra, 60(sp) +; RV32IM-NEXT: sw s0, 56(sp) +; RV32IM-NEXT: sw s1, 52(sp) +; RV32IM-NEXT: mv s0, a1 +; RV32IM-NEXT: mv s1, a0 +; RV32IM-NEXT: sw zero, 12(sp) +; RV32IM-NEXT: sw zero, 8(sp) +; RV32IM-NEXT: sw zero, 28(sp) +; RV32IM-NEXT: sw zero, 24(sp) +; RV32IM-NEXT: lui a0, 342392 +; RV32IM-NEXT: addi a0, a0, 668 +; RV32IM-NEXT: sw a0, 4(sp) +; RV32IM-NEXT: lui a0, 770382 +; RV32IM-NEXT: addi a0, a0, 1505 +; RV32IM-NEXT: sw a0, 0(sp) +; RV32IM-NEXT: srli a0, a1, 1 +; RV32IM-NEXT: sw a0, 20(sp) +; RV32IM-NEXT: slli a0, a1, 31 +; RV32IM-NEXT: srli a1, s1, 1 +; RV32IM-NEXT: or a3, a1, a0 +; RV32IM-NEXT: addi a0, sp, 32 +; RV32IM-NEXT: addi a1, sp, 16 +; RV32IM-NEXT: mv a2, sp +; RV32IM-NEXT: sw a3, 16(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: lw a0, 44(sp) +; RV32IM-NEXT: srli a1, a0, 4 +; RV32IM-NEXT: lw a2, 40(sp) +; RV32IM-NEXT: addi a3, zero, 98 +; RV32IM-NEXT: mul a1, a1, a3 +; RV32IM-NEXT: slli a0, a0, 28 +; RV32IM-NEXT: srli a2, a2, 4 +; RV32IM-NEXT: or a0, a2, a0 +; RV32IM-NEXT: mulhu a2, a0, a3 +; RV32IM-NEXT: add a1, a2, a1 +; RV32IM-NEXT: sub a1, s0, a1 +; RV32IM-NEXT: mul a0, a0, a3 +; RV32IM-NEXT: sltu a2, s1, a0 +; RV32IM-NEXT: sub a1, a1, a2 +; RV32IM-NEXT: sub a0, s1, a0 +; RV32IM-NEXT: lw s1, 52(sp) +; RV32IM-NEXT: lw s0, 56(sp) +; RV32IM-NEXT: lw ra, 60(sp) +; RV32IM-NEXT: addi sp, sp, 64 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_urem_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: sd s0, 0(sp) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: lui a1, 2675 +; RV64I-NEXT: addiw a1, a1, -251 +; RV64I-NEXT: slli a1, a1, 13 +; RV64I-NEXT: addi a1, a1, 1839 +; RV64I-NEXT: slli a1, a1, 13 +; RV64I-NEXT: addi a1, a1, 167 +; RV64I-NEXT: slli a1, a1, 13 +; RV64I-NEXT: addi a2, a1, 1505 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: srli a0, a1, 4 ; RV64I-NEXT: addi a1, zero, 98 -; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: ld s0, 0(sp) ; RV64I-NEXT: ld ra, 8(sp) ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -20,30 +20,55 @@ ; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: sw s5, 4(sp) -; RV32I-NEXT: lhu s2, 12(a1) +; RV32I-NEXT: lhu s2, 0(a1) ; RV32I-NEXT: lhu s3, 8(a1) -; RV32I-NEXT: lhu s0, 4(a1) -; RV32I-NEXT: lhu a2, 0(a1) -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, a2 
-; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s4, a0 -; RV32I-NEXT: addi a1, zero, 124 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s5, a0 -; RV32I-NEXT: addi a1, zero, 98 -; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: lhu s4, 4(a1) +; RV32I-NEXT: lhu s1, 12(a1) ; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 1373 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: sub a1, s1, a0 +; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: addi a3, a2, -2 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: addi a1, a2, -512 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 9 ; RV32I-NEXT: addi a1, zero, 1003 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s5, s1, a0 +; RV32I-NEXT: srli a0, s4, 2 +; RV32I-NEXT: lui a1, 4 +; RV32I-NEXT: addi a1, a1, 529 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 19 +; RV32I-NEXT: addi a1, zero, 124 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s4, s4, a0 +; RV32I-NEXT: srli a0, s3, 1 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: addi a1, a1, -1421 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 17 +; RV32I-NEXT: addi a1, zero, 98 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s1, s3, a0 +; RV32I-NEXT: lui a0, 11 +; RV32I-NEXT: addi a1, a0, -905 ; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: sh a0, 6(s1) -; RV32I-NEXT: sh s0, 4(s1) -; RV32I-NEXT: sh s5, 2(s1) -; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 22 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s2, a0 +; RV32I-NEXT: sh a0, 0(s0) +; RV32I-NEXT: sh s1, 4(s0) +; RV32I-NEXT: sh s4, 2(s0) +; RV32I-NEXT: sh s5, 6(s0) ; RV32I-NEXT: lw s5, 4(sp) ; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) @@ -56,46 +81,50 @@ ; ; RV32IM-LABEL: fold_urem_vec_1: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lhu a6, 12(a1) -; RV32IM-NEXT: lhu a3, 8(a1) -; RV32IM-NEXT: lhu a4, 0(a1) +; RV32IM-NEXT: lhu a6, 0(a1) +; RV32IM-NEXT: lhu a3, 12(a1) +; RV32IM-NEXT: lhu a7, 8(a1) ; RV32IM-NEXT: lhu a1, 4(a1) -; RV32IM-NEXT: lui a5, 364242 -; RV32IM-NEXT: addi a5, a5, 777 -; RV32IM-NEXT: mulhu a5, a4, a5 -; RV32IM-NEXT: sub a2, a4, a5 +; RV32IM-NEXT: addi a5, zero, 1373 +; RV32IM-NEXT: mul a5, a3, a5 +; RV32IM-NEXT: srli a5, a5, 16 +; RV32IM-NEXT: sub a2, a3, a5 +; RV32IM-NEXT: lui a4, 16 +; RV32IM-NEXT: addi a4, a4, -2 +; RV32IM-NEXT: and a2, a2, a4 ; RV32IM-NEXT: srli a2, a2, 1 ; RV32IM-NEXT: add a2, a2, a5 -; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: srli a2, a2, 9 +; RV32IM-NEXT: addi a4, zero, 1003 +; RV32IM-NEXT: mul a2, a2, a4 +; RV32IM-NEXT: sub a2, a3, a2 +; RV32IM-NEXT: srli a3, a1, 2 +; RV32IM-NEXT: lui a4, 4 +; RV32IM-NEXT: addi a4, a4, 529 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: srli a3, a3, 19 +; RV32IM-NEXT: addi a4, zero, 124 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: sub a1, a1, a3 +; RV32IM-NEXT: srli a3, a7, 1 +; RV32IM-NEXT: lui a4, 1 +; RV32IM-NEXT: addi a4, a4, -1421 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: srli a3, a3, 17 +; RV32IM-NEXT: addi a4, zero, 98 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: sub a3, a7, a3 +; RV32IM-NEXT: lui a4, 11 +; RV32IM-NEXT: addi a4, a4, -905 +; RV32IM-NEXT: mul a4, a6, a4 +; RV32IM-NEXT: srli a4, a4, 22 ; RV32IM-NEXT: addi a5, zero, 95 -; RV32IM-NEXT: mul a2, a2, a5 -; RV32IM-NEXT: sub a2, a4, a2 -; RV32IM-NEXT: srli a4, a1, 2 -; RV32IM-NEXT: lui a5, 135300 -; RV32IM-NEXT: addi a5, a5, 529 -; 
RV32IM-NEXT: mulhu a4, a4, a5 -; RV32IM-NEXT: srli a4, a4, 2 -; RV32IM-NEXT: addi a5, zero, 124 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a1, a1, a4 -; RV32IM-NEXT: lui a4, 342392 -; RV32IM-NEXT: addi a4, a4, 669 -; RV32IM-NEXT: mulhu a4, a3, a4 -; RV32IM-NEXT: srli a4, a4, 5 -; RV32IM-NEXT: addi a5, zero, 98 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a3, a3, a4 -; RV32IM-NEXT: lui a4, 267633 -; RV32IM-NEXT: addi a4, a4, -1809 -; RV32IM-NEXT: mulhu a4, a6, a4 -; RV32IM-NEXT: srli a4, a4, 8 -; RV32IM-NEXT: addi a5, zero, 1003 ; RV32IM-NEXT: mul a4, a4, a5 ; RV32IM-NEXT: sub a4, a6, a4 -; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sh a4, 0(a0) ; RV32IM-NEXT: sh a3, 4(a0) ; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: sh a2, 6(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: fold_urem_vec_1: @@ -108,30 +137,55 @@ ; RV64I-NEXT: sd s3, 24(sp) ; RV64I-NEXT: sd s4, 16(sp) ; RV64I-NEXT: sd s5, 8(sp) -; RV64I-NEXT: lhu s2, 24(a1) +; RV64I-NEXT: lhu s2, 0(a1) ; RV64I-NEXT: lhu s3, 16(a1) -; RV64I-NEXT: lhu s0, 8(a1) -; RV64I-NEXT: lhu a2, 0(a1) -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s4, a0 -; RV64I-NEXT: addi a1, zero, 124 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s5, a0 -; RV64I-NEXT: addi a1, zero, 98 -; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: lhu s4, 8(a1) +; RV64I-NEXT: lhu s1, 24(a1) ; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 1373 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: sub a1, s1, a0 +; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: addiw a3, a2, -2 +; RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: srli a1, a1, 1 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: addiw a1, a2, -512 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: srli a0, a0, 9 ; RV64I-NEXT: addi a1, zero, 1003 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s5, s1, a0 +; RV64I-NEXT: srli a0, s4, 2 +; RV64I-NEXT: lui a1, 4 +; RV64I-NEXT: addiw a1, a1, 529 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 19 +; RV64I-NEXT: addi a1, zero, 124 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s4, s4, a0 +; RV64I-NEXT: srli a0, s3, 1 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: addiw a1, a1, -1421 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 17 +; RV64I-NEXT: addi a1, zero, 98 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s1, s3, a0 +; RV64I-NEXT: lui a0, 11 +; RV64I-NEXT: addiw a1, a0, -905 ; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: sh a0, 6(s1) -; RV64I-NEXT: sh s0, 4(s1) -; RV64I-NEXT: sh s5, 2(s1) -; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 22 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s2, a0 +; RV64I-NEXT: sh a0, 0(s0) +; RV64I-NEXT: sh s1, 4(s0) +; RV64I-NEXT: sh s4, 2(s0) +; RV64I-NEXT: sh s5, 6(s0) ; RV64I-NEXT: ld s5, 8(sp) ; RV64I-NEXT: ld s4, 16(sp) ; RV64I-NEXT: ld s3, 24(sp) @@ -144,71 +198,50 @@ ; ; RV64IM-LABEL: fold_urem_vec_1: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lhu a6, 24(a1) -; RV64IM-NEXT: lhu a3, 16(a1) -; RV64IM-NEXT: lhu a4, 8(a1) -; RV64IM-NEXT: lhu a1, 0(a1) -; RV64IM-NEXT: lui a5, 1423 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, -1811 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, 561 -; RV64IM-NEXT: mulhu a5, a1, a5 -; 
RV64IM-NEXT: sub a2, a1, a5 +; RV64IM-NEXT: lhu a6, 0(a1) +; RV64IM-NEXT: lhu a3, 24(a1) +; RV64IM-NEXT: lhu a7, 16(a1) +; RV64IM-NEXT: lhu a1, 8(a1) +; RV64IM-NEXT: addi a5, zero, 1373 +; RV64IM-NEXT: mul a5, a3, a5 +; RV64IM-NEXT: srli a5, a5, 16 +; RV64IM-NEXT: sub a2, a3, a5 +; RV64IM-NEXT: lui a4, 16 +; RV64IM-NEXT: addiw a4, a4, -2 +; RV64IM-NEXT: and a2, a2, a4 ; RV64IM-NEXT: srli a2, a2, 1 ; RV64IM-NEXT: add a2, a2, a5 -; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: srli a2, a2, 9 +; RV64IM-NEXT: addi a4, zero, 1003 +; RV64IM-NEXT: mul a2, a2, a4 +; RV64IM-NEXT: sub a2, a3, a2 +; RV64IM-NEXT: srli a3, a1, 2 +; RV64IM-NEXT: lui a4, 4 +; RV64IM-NEXT: addiw a4, a4, 529 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: srli a3, a3, 19 +; RV64IM-NEXT: addi a4, zero, 124 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: sub a1, a1, a3 +; RV64IM-NEXT: srli a3, a7, 1 +; RV64IM-NEXT: lui a4, 1 +; RV64IM-NEXT: addiw a4, a4, -1421 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: srli a3, a3, 17 +; RV64IM-NEXT: addi a4, zero, 98 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: sub a3, a7, a3 +; RV64IM-NEXT: lui a4, 11 +; RV64IM-NEXT: addiw a4, a4, -905 +; RV64IM-NEXT: mul a4, a6, a4 +; RV64IM-NEXT: srli a4, a4, 22 ; RV64IM-NEXT: addi a5, zero, 95 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: sub a1, a1, a2 -; RV64IM-NEXT: srli a2, a4, 2 -; RV64IM-NEXT: lui a5, 264 -; RV64IM-NEXT: addiw a5, a5, 1057 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1057 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1057 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, 133 -; RV64IM-NEXT: mulhu a2, a2, a5 -; RV64IM-NEXT: srli a2, a2, 3 -; RV64IM-NEXT: addi a5, zero, 124 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: sub a2, a4, a2 -; RV64IM-NEXT: srli a4, a3, 1 -; RV64IM-NEXT: lui a5, 2675 -; RV64IM-NEXT: addiw a5, a5, -251 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, 1839 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, 167 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, 1505 -; RV64IM-NEXT: mulhu a4, a4, a5 -; RV64IM-NEXT: srli a4, a4, 4 -; RV64IM-NEXT: addi a5, zero, 98 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a3, a3, a4 -; RV64IM-NEXT: lui a4, 8364 -; RV64IM-NEXT: addiw a4, a4, -1977 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 1907 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 453 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 1213 -; RV64IM-NEXT: mulhu a4, a6, a4 -; RV64IM-NEXT: srli a4, a4, 7 -; RV64IM-NEXT: addi a5, zero, 1003 ; RV64IM-NEXT: mul a4, a4, a5 ; RV64IM-NEXT: sub a4, a6, a4 -; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: sh a4, 0(a0) ; RV64IM-NEXT: sh a3, 4(a0) -; RV64IM-NEXT: sh a2, 2(a0) -; RV64IM-NEXT: sh a1, 0(a0) +; RV64IM-NEXT: sh a1, 2(a0) +; RV64IM-NEXT: sh a2, 6(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -225,30 +258,47 @@ ; RV32I-NEXT: sw s3, 12(sp) ; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: sw s5, 4(sp) +; RV32I-NEXT: sw s6, 0(sp) ; RV32I-NEXT: lhu s2, 12(a1) ; RV32I-NEXT: lhu s3, 8(a1) -; RV32I-NEXT: lhu s0, 4(a1) -; RV32I-NEXT: lhu a2, 0(a1) -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s4, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: lhu s4, 4(a1) +; RV32I-NEXT: lhu s1, 0(a1) ; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: lui a0, 11 +; RV32I-NEXT: addi s0, a0, -905 +; RV32I-NEXT: mv a0, s1 +; 
RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 22 ; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s6, s1, a0 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 22 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s4, s4, a0 ; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 22 ; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s1, s3, a0 ; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: sh a0, 6(s1) -; RV32I-NEXT: sh s0, 4(s1) -; RV32I-NEXT: sh s5, 2(s1) -; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 22 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s2, a0 +; RV32I-NEXT: sh a0, 6(s5) +; RV32I-NEXT: sh s1, 4(s5) +; RV32I-NEXT: sh s4, 2(s5) +; RV32I-NEXT: sh s6, 0(s5) +; RV32I-NEXT: lw s6, 0(sp) ; RV32I-NEXT: lw s5, 4(sp) ; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) @@ -262,44 +312,32 @@ ; RV32IM-LABEL: fold_urem_vec_2: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lhu a6, 12(a1) -; RV32IM-NEXT: lhu a7, 8(a1) +; RV32IM-NEXT: lhu a3, 8(a1) ; RV32IM-NEXT: lhu a4, 0(a1) ; RV32IM-NEXT: lhu a1, 4(a1) -; RV32IM-NEXT: lui a5, 364242 -; RV32IM-NEXT: addi a5, a5, 777 -; RV32IM-NEXT: mulhu a2, a4, a5 -; RV32IM-NEXT: sub a3, a4, a2 -; RV32IM-NEXT: srli a3, a3, 1 -; RV32IM-NEXT: add a2, a3, a2 -; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: addi a3, zero, 95 -; RV32IM-NEXT: mul a2, a2, a3 -; RV32IM-NEXT: sub t0, a4, a2 -; RV32IM-NEXT: mulhu a4, a1, a5 -; RV32IM-NEXT: sub a2, a1, a4 -; RV32IM-NEXT: srli a2, a2, 1 -; RV32IM-NEXT: add a2, a2, a4 -; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: mul a2, a2, a3 -; RV32IM-NEXT: sub a1, a1, a2 -; RV32IM-NEXT: mulhu a2, a7, a5 -; RV32IM-NEXT: sub a4, a7, a2 -; RV32IM-NEXT: srli a4, a4, 1 -; RV32IM-NEXT: add a2, a4, a2 -; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: mul a2, a2, a3 -; RV32IM-NEXT: sub a2, a7, a2 -; RV32IM-NEXT: mulhu a4, a6, a5 -; RV32IM-NEXT: sub a5, a6, a4 -; RV32IM-NEXT: srli a5, a5, 1 -; RV32IM-NEXT: add a4, a5, a4 -; RV32IM-NEXT: srli a4, a4, 6 -; RV32IM-NEXT: mul a3, a4, a3 -; RV32IM-NEXT: sub a3, a6, a3 -; RV32IM-NEXT: sh a3, 6(a0) -; RV32IM-NEXT: sh a2, 4(a0) +; RV32IM-NEXT: lui a5, 11 +; RV32IM-NEXT: addi a5, a5, -905 +; RV32IM-NEXT: mul a2, a4, a5 +; RV32IM-NEXT: srli a2, a2, 22 +; RV32IM-NEXT: addi a7, zero, 95 +; RV32IM-NEXT: mul a2, a2, a7 +; RV32IM-NEXT: sub a2, a4, a2 +; RV32IM-NEXT: mul a4, a1, a5 +; RV32IM-NEXT: srli a4, a4, 22 +; RV32IM-NEXT: mul a4, a4, a7 +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: mul a4, a3, a5 +; RV32IM-NEXT: srli a4, a4, 22 +; RV32IM-NEXT: mul a4, a4, a7 +; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: mul a4, a6, a5 +; RV32IM-NEXT: srli a4, a4, 22 +; RV32IM-NEXT: mul a4, a4, a7 +; RV32IM-NEXT: sub a4, a6, a4 +; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sh a3, 4(a0) ; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh t0, 0(a0) +; RV32IM-NEXT: sh a2, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: fold_urem_vec_2: @@ -312,30 +350,47 @@ ; RV64I-NEXT: sd s3, 24(sp) ; RV64I-NEXT: sd s4, 16(sp) ; RV64I-NEXT: sd s5, 8(sp) +; RV64I-NEXT: sd s6, 0(sp) ; RV64I-NEXT: lhu s2, 24(a1) ; RV64I-NEXT: lhu s3, 16(a1) -; RV64I-NEXT: lhu s0, 8(a1) -; RV64I-NEXT: lhu a2, 0(a1) -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: addi a1, zero, 95 -; 
RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s4, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: lhu s4, 8(a1) +; RV64I-NEXT: lhu s1, 0(a1) ; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: lui a0, 11 +; RV64I-NEXT: addiw s0, a0, -905 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 22 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s6, s1, a0 +; RV64I-NEXT: mv a0, s4 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 22 ; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s4, s4, a0 ; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 22 ; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s1, s3, a0 ; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: sh a0, 6(s1) -; RV64I-NEXT: sh s0, 4(s1) -; RV64I-NEXT: sh s5, 2(s1) -; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 22 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s2, a0 +; RV64I-NEXT: sh a0, 6(s5) +; RV64I-NEXT: sh s1, 4(s5) +; RV64I-NEXT: sh s4, 2(s5) +; RV64I-NEXT: sh s6, 0(s5) +; RV64I-NEXT: ld s6, 0(sp) ; RV64I-NEXT: ld s5, 8(sp) ; RV64I-NEXT: ld s4, 16(sp) ; RV64I-NEXT: ld s3, 24(sp) @@ -349,50 +404,32 @@ ; RV64IM-LABEL: fold_urem_vec_2: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lhu a6, 24(a1) -; RV64IM-NEXT: lhu a7, 16(a1) -; RV64IM-NEXT: lhu a4, 8(a1) -; RV64IM-NEXT: lhu a1, 0(a1) -; RV64IM-NEXT: lui a5, 1423 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, -1811 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, 561 -; RV64IM-NEXT: mulhu a2, a1, a5 -; RV64IM-NEXT: sub a3, a1, a2 -; RV64IM-NEXT: srli a3, a3, 1 -; RV64IM-NEXT: add a2, a3, a2 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: addi a3, zero, 95 -; RV64IM-NEXT: mul a2, a2, a3 -; RV64IM-NEXT: sub t0, a1, a2 -; RV64IM-NEXT: mulhu a2, a4, a5 -; RV64IM-NEXT: sub a1, a4, a2 -; RV64IM-NEXT: srli a1, a1, 1 -; RV64IM-NEXT: add a1, a1, a2 -; RV64IM-NEXT: srli a1, a1, 6 -; RV64IM-NEXT: mul a1, a1, a3 -; RV64IM-NEXT: sub a1, a4, a1 -; RV64IM-NEXT: mulhu a2, a7, a5 -; RV64IM-NEXT: sub a4, a7, a2 -; RV64IM-NEXT: srli a4, a4, 1 -; RV64IM-NEXT: add a2, a4, a2 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: mul a2, a2, a3 -; RV64IM-NEXT: sub a2, a7, a2 -; RV64IM-NEXT: mulhu a4, a6, a5 -; RV64IM-NEXT: sub a5, a6, a4 -; RV64IM-NEXT: srli a5, a5, 1 -; RV64IM-NEXT: add a4, a5, a4 -; RV64IM-NEXT: srli a4, a4, 6 -; RV64IM-NEXT: mul a3, a4, a3 -; RV64IM-NEXT: sub a3, a6, a3 -; RV64IM-NEXT: sh a3, 6(a0) -; RV64IM-NEXT: sh a2, 4(a0) +; RV64IM-NEXT: lhu a3, 16(a1) +; RV64IM-NEXT: lhu a4, 0(a1) +; RV64IM-NEXT: lhu a1, 8(a1) +; RV64IM-NEXT: lui a5, 11 +; RV64IM-NEXT: addiw a5, a5, -905 +; RV64IM-NEXT: mul a2, a4, a5 +; RV64IM-NEXT: srli a2, a2, 22 +; RV64IM-NEXT: addi a7, zero, 95 +; RV64IM-NEXT: mul a2, a2, a7 +; RV64IM-NEXT: sub a2, a4, a2 +; RV64IM-NEXT: mul a4, a1, a5 +; RV64IM-NEXT: srli a4, a4, 22 +; RV64IM-NEXT: mul a4, a4, a7 +; RV64IM-NEXT: sub a1, a1, a4 +; RV64IM-NEXT: mul a4, a3, a5 +; RV64IM-NEXT: srli a4, a4, 22 +; RV64IM-NEXT: mul a4, a4, a7 +; RV64IM-NEXT: sub a3, a3, a4 +; RV64IM-NEXT: mul a4, a6, a5 +; 
RV64IM-NEXT: srli a4, a4, 22 +; RV64IM-NEXT: mul a4, a4, a7 +; RV64IM-NEXT: sub a4, a6, a4 +; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: sh a3, 4(a0) ; RV64IM-NEXT: sh a1, 2(a0) -; RV64IM-NEXT: sh t0, 0(a0) +; RV64IM-NEXT: sh a2, 0(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -414,52 +451,53 @@ ; RV32I-NEXT: sw s6, 16(sp) ; RV32I-NEXT: sw s7, 12(sp) ; RV32I-NEXT: sw s8, 8(sp) -; RV32I-NEXT: sw s9, 4(sp) ; RV32I-NEXT: lhu s2, 0(a1) ; RV32I-NEXT: lhu s3, 4(a1) ; RV32I-NEXT: lhu s4, 8(a1) ; RV32I-NEXT: lhu s1, 12(a1) -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: call __umodsi3 ; RV32I-NEXT: mv s5, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s6, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s7, a0 -; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s8, a0 -; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: lui a0, 11 +; RV32I-NEXT: addi s0, a0, -905 ; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: call __udivsi3 -; RV32I-NEXT: mv s9, a0 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli s6, a0, 22 ; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s6 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s8, s1, a0 ; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: call __udivsi3 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli s7, a0, 22 ; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s7 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s4, s4, a0 ; RV32I-NEXT: mv a0, s3 -; RV32I-NEXT: call __udivsi3 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli s1, a0, 22 ; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s3, s3, a0 ; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __udivsi3 -; RV32I-NEXT: add a0, s8, a0 -; RV32I-NEXT: add a1, s7, s1 -; RV32I-NEXT: add a2, s6, s4 -; RV32I-NEXT: add a3, s5, s9 -; RV32I-NEXT: sh a3, 6(s0) -; RV32I-NEXT: sh a2, 4(s0) -; RV32I-NEXT: sh a1, 2(s0) -; RV32I-NEXT: sh a0, 0(s0) -; RV32I-NEXT: lw s9, 4(sp) +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli s0, a0, 22 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s2, a0 +; RV32I-NEXT: add a0, a0, s0 +; RV32I-NEXT: add a1, s3, s1 +; RV32I-NEXT: add a2, s4, s7 +; RV32I-NEXT: add a3, s8, s6 +; RV32I-NEXT: sh a3, 6(s5) +; RV32I-NEXT: sh a2, 4(s5) +; RV32I-NEXT: sh a1, 2(s5) +; RV32I-NEXT: sh a0, 0(s5) ; RV32I-NEXT: lw s8, 8(sp) ; RV32I-NEXT: lw s7, 12(sp) ; RV32I-NEXT: lw s6, 16(sp) @@ -476,173 +514,144 @@ ; RV32IM-LABEL: combine_urem_udiv: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lhu a6, 0(a1) -; RV32IM-NEXT: lhu a7, 4(a1) +; RV32IM-NEXT: lhu a3, 4(a1) ; RV32IM-NEXT: lhu a4, 12(a1) ; RV32IM-NEXT: lhu a1, 8(a1) -; RV32IM-NEXT: lui a5, 364242 -; RV32IM-NEXT: addi a5, a5, 777 -; RV32IM-NEXT: mulhu a2, a4, a5 -; RV32IM-NEXT: sub a3, a4, a2 -; RV32IM-NEXT: srli a3, a3, 1 -; RV32IM-NEXT: add a2, a3, a2 -; RV32IM-NEXT: srli t3, a2, 6 -; RV32IM-NEXT: addi t0, zero, 95 -; RV32IM-NEXT: mul a3, t3, t0 -; RV32IM-NEXT: sub t1, a4, a3 -; RV32IM-NEXT: mulhu a4, a1, a5 -; RV32IM-NEXT: sub a3, a1, a4 -; RV32IM-NEXT: srli a3, a3, 1 -; RV32IM-NEXT: add a3, a3, a4 -; RV32IM-NEXT: srli a3, a3, 6 -; RV32IM-NEXT: mul a4, a3, t0 -; RV32IM-NEXT: sub t2, a1, a4 -; 
RV32IM-NEXT: mulhu a4, a7, a5 -; RV32IM-NEXT: sub a1, a7, a4 -; RV32IM-NEXT: srli a1, a1, 1 -; RV32IM-NEXT: add a1, a1, a4 -; RV32IM-NEXT: srli a1, a1, 6 -; RV32IM-NEXT: mul a4, a1, t0 -; RV32IM-NEXT: sub a4, a7, a4 -; RV32IM-NEXT: mulhu a5, a6, a5 -; RV32IM-NEXT: sub a2, a6, a5 -; RV32IM-NEXT: srli a2, a2, 1 -; RV32IM-NEXT: add a2, a2, a5 -; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: mul a5, a2, t0 +; RV32IM-NEXT: lui a5, 11 +; RV32IM-NEXT: addi a5, a5, -905 +; RV32IM-NEXT: mul a2, a4, a5 +; RV32IM-NEXT: srli t0, a2, 22 +; RV32IM-NEXT: addi a7, zero, 95 +; RV32IM-NEXT: mul a2, t0, a7 +; RV32IM-NEXT: sub t1, a4, a2 +; RV32IM-NEXT: mul a4, a1, a5 +; RV32IM-NEXT: srli a4, a4, 22 +; RV32IM-NEXT: mul a2, a4, a7 +; RV32IM-NEXT: sub t2, a1, a2 +; RV32IM-NEXT: mul a2, a3, a5 +; RV32IM-NEXT: srli a2, a2, 22 +; RV32IM-NEXT: mul a1, a2, a7 +; RV32IM-NEXT: sub a1, a3, a1 +; RV32IM-NEXT: mul a3, a6, a5 +; RV32IM-NEXT: srli a3, a3, 22 +; RV32IM-NEXT: mul a5, a3, a7 ; RV32IM-NEXT: sub a5, a6, a5 -; RV32IM-NEXT: add a2, a5, a2 -; RV32IM-NEXT: add a1, a4, a1 -; RV32IM-NEXT: add a3, t2, a3 -; RV32IM-NEXT: add a4, t1, t3 +; RV32IM-NEXT: add a3, a5, a3 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: add a2, t2, a4 +; RV32IM-NEXT: add a4, t1, t0 ; RV32IM-NEXT: sh a4, 6(a0) -; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a2, 4(a0) ; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: sh a3, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: combine_urem_udiv: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -96 -; RV64I-NEXT: sd ra, 88(sp) -; RV64I-NEXT: sd s0, 80(sp) -; RV64I-NEXT: sd s1, 72(sp) -; RV64I-NEXT: sd s2, 64(sp) -; RV64I-NEXT: sd s3, 56(sp) -; RV64I-NEXT: sd s4, 48(sp) -; RV64I-NEXT: sd s5, 40(sp) -; RV64I-NEXT: sd s6, 32(sp) -; RV64I-NEXT: sd s7, 24(sp) -; RV64I-NEXT: sd s8, 16(sp) -; RV64I-NEXT: sd s9, 8(sp) +; RV64I-NEXT: addi sp, sp, -80 +; RV64I-NEXT: sd ra, 72(sp) +; RV64I-NEXT: sd s0, 64(sp) +; RV64I-NEXT: sd s1, 56(sp) +; RV64I-NEXT: sd s2, 48(sp) +; RV64I-NEXT: sd s3, 40(sp) +; RV64I-NEXT: sd s4, 32(sp) +; RV64I-NEXT: sd s5, 24(sp) +; RV64I-NEXT: sd s6, 16(sp) +; RV64I-NEXT: sd s7, 8(sp) +; RV64I-NEXT: sd s8, 0(sp) ; RV64I-NEXT: lhu s2, 0(a1) ; RV64I-NEXT: lhu s3, 8(a1) ; RV64I-NEXT: lhu s4, 16(a1) ; RV64I-NEXT: lhu s1, 24(a1) -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: call __umoddi3 ; RV64I-NEXT: mv s5, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s6, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s7, a0 -; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s8, a0 -; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: lui a0, 11 +; RV64I-NEXT: addiw s0, a0, -905 ; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: call __udivdi3 -; RV64I-NEXT: mv s9, a0 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli s6, a0, 22 ; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s6 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s8, s1, a0 ; RV64I-NEXT: mv a0, s4 -; RV64I-NEXT: call __udivdi3 -; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli s7, a0, 22 ; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s7 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s4, s4, a0 ; RV64I-NEXT: mv a0, s3 -; RV64I-NEXT: call __udivdi3 -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: 
srli s1, a0, 22 ; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s3, s3, a0 ; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __udivdi3 -; RV64I-NEXT: add a0, s8, a0 -; RV64I-NEXT: add a1, s7, s1 -; RV64I-NEXT: add a2, s6, s4 -; RV64I-NEXT: add a3, s5, s9 -; RV64I-NEXT: sh a3, 6(s0) -; RV64I-NEXT: sh a2, 4(s0) -; RV64I-NEXT: sh a1, 2(s0) -; RV64I-NEXT: sh a0, 0(s0) -; RV64I-NEXT: ld s9, 8(sp) -; RV64I-NEXT: ld s8, 16(sp) -; RV64I-NEXT: ld s7, 24(sp) -; RV64I-NEXT: ld s6, 32(sp) -; RV64I-NEXT: ld s5, 40(sp) -; RV64I-NEXT: ld s4, 48(sp) -; RV64I-NEXT: ld s3, 56(sp) -; RV64I-NEXT: ld s2, 64(sp) -; RV64I-NEXT: ld s1, 72(sp) -; RV64I-NEXT: ld s0, 80(sp) -; RV64I-NEXT: ld ra, 88(sp) -; RV64I-NEXT: addi sp, sp, 96 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli s0, a0, 22 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s2, a0 +; RV64I-NEXT: add a0, a0, s0 +; RV64I-NEXT: add a1, s3, s1 +; RV64I-NEXT: add a2, s4, s7 +; RV64I-NEXT: add a3, s8, s6 +; RV64I-NEXT: sh a3, 6(s5) +; RV64I-NEXT: sh a2, 4(s5) +; RV64I-NEXT: sh a1, 2(s5) +; RV64I-NEXT: sh a0, 0(s5) +; RV64I-NEXT: ld s8, 0(sp) +; RV64I-NEXT: ld s7, 8(sp) +; RV64I-NEXT: ld s6, 16(sp) +; RV64I-NEXT: ld s5, 24(sp) +; RV64I-NEXT: ld s4, 32(sp) +; RV64I-NEXT: ld s3, 40(sp) +; RV64I-NEXT: ld s2, 48(sp) +; RV64I-NEXT: ld s1, 56(sp) +; RV64I-NEXT: ld s0, 64(sp) +; RV64I-NEXT: ld ra, 72(sp) +; RV64I-NEXT: addi sp, sp, 80 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: combine_urem_udiv: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lhu a6, 0(a1) -; RV64IM-NEXT: lhu a7, 8(a1) -; RV64IM-NEXT: lhu a4, 16(a1) -; RV64IM-NEXT: lhu a1, 24(a1) -; RV64IM-NEXT: lui a5, 1423 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, -1811 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, 561 -; RV64IM-NEXT: mulhu a2, a1, a5 -; RV64IM-NEXT: sub a3, a1, a2 -; RV64IM-NEXT: srli a3, a3, 1 -; RV64IM-NEXT: add a2, a3, a2 -; RV64IM-NEXT: srli t3, a2, 6 -; RV64IM-NEXT: addi t0, zero, 95 -; RV64IM-NEXT: mul a3, t3, t0 -; RV64IM-NEXT: sub t1, a1, a3 -; RV64IM-NEXT: mulhu a3, a4, a5 -; RV64IM-NEXT: sub a1, a4, a3 -; RV64IM-NEXT: srli a1, a1, 1 -; RV64IM-NEXT: add a1, a1, a3 -; RV64IM-NEXT: srli a1, a1, 6 -; RV64IM-NEXT: mul a3, a1, t0 -; RV64IM-NEXT: sub t2, a4, a3 -; RV64IM-NEXT: mulhu a4, a7, a5 -; RV64IM-NEXT: sub a3, a7, a4 -; RV64IM-NEXT: srli a3, a3, 1 -; RV64IM-NEXT: add a3, a3, a4 -; RV64IM-NEXT: srli a3, a3, 6 -; RV64IM-NEXT: mul a4, a3, t0 -; RV64IM-NEXT: sub a4, a7, a4 -; RV64IM-NEXT: mulhu a5, a6, a5 -; RV64IM-NEXT: sub a2, a6, a5 -; RV64IM-NEXT: srli a2, a2, 1 -; RV64IM-NEXT: add a2, a2, a5 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: mul a5, a2, t0 +; RV64IM-NEXT: lhu a3, 8(a1) +; RV64IM-NEXT: lhu a4, 24(a1) +; RV64IM-NEXT: lhu a1, 16(a1) +; RV64IM-NEXT: lui a5, 11 +; RV64IM-NEXT: addiw a5, a5, -905 +; RV64IM-NEXT: mul a2, a4, a5 +; RV64IM-NEXT: srli t0, a2, 22 +; RV64IM-NEXT: addi a7, zero, 95 +; RV64IM-NEXT: mul a2, t0, a7 +; RV64IM-NEXT: sub t1, a4, a2 +; RV64IM-NEXT: mul a4, a1, a5 +; RV64IM-NEXT: srli a4, a4, 22 +; RV64IM-NEXT: mul a2, a4, a7 +; RV64IM-NEXT: sub t2, a1, a2 +; RV64IM-NEXT: mul a2, a3, a5 +; RV64IM-NEXT: srli a2, a2, 22 +; RV64IM-NEXT: mul a1, a2, a7 +; RV64IM-NEXT: sub a1, a3, a1 +; RV64IM-NEXT: mul a3, a6, a5 +; RV64IM-NEXT: srli a3, a3, 22 +; RV64IM-NEXT: mul a5, a3, a7 ; RV64IM-NEXT: sub a5, a6, a5 -; 
RV64IM-NEXT: add a2, a5, a2 -; RV64IM-NEXT: add a3, a4, a3 -; RV64IM-NEXT: add a1, t2, a1 -; RV64IM-NEXT: add a4, t1, t3 +; RV64IM-NEXT: add a3, a5, a3 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: add a2, t2, a4 +; RV64IM-NEXT: add a4, t1, t0 ; RV64IM-NEXT: sh a4, 6(a0) -; RV64IM-NEXT: sh a1, 4(a0) -; RV64IM-NEXT: sh a3, 2(a0) -; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a2, 4(a0) +; RV64IM-NEXT: sh a1, 2(a0) +; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, %2 = udiv <4 x i16> %x, @@ -660,21 +669,28 @@ ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: lhu s2, 8(a1) ; RV32I-NEXT: lhu s3, 4(a1) -; RV32I-NEXT: lhu s1, 0(a1) -; RV32I-NEXT: lhu a2, 12(a1) -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu s4, 0(a1) +; RV32I-NEXT: lhu s0, 12(a1) +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lui a0, 11 +; RV32I-NEXT: addi a1, a0, -905 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 22 ; RV32I-NEXT: addi a1, zero, 95 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: andi a1, s1, 63 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: andi a1, s4, 63 ; RV32I-NEXT: andi a2, s3, 31 ; RV32I-NEXT: andi a3, s2, 7 -; RV32I-NEXT: sh a0, 6(s0) -; RV32I-NEXT: sh a3, 4(s0) -; RV32I-NEXT: sh a2, 2(s0) -; RV32I-NEXT: sh a1, 0(s0) +; RV32I-NEXT: sh a3, 4(s1) +; RV32I-NEXT: sh a2, 2(s1) +; RV32I-NEXT: sh a1, 0(s1) +; RV32I-NEXT: sh a0, 6(s1) +; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -689,15 +705,12 @@ ; RV32IM-NEXT: lhu a3, 4(a1) ; RV32IM-NEXT: lhu a4, 12(a1) ; RV32IM-NEXT: lhu a1, 0(a1) -; RV32IM-NEXT: lui a5, 364242 -; RV32IM-NEXT: addi a5, a5, 777 -; RV32IM-NEXT: mulhu a5, a4, a5 -; RV32IM-NEXT: sub a2, a4, a5 -; RV32IM-NEXT: srli a2, a2, 1 -; RV32IM-NEXT: add a2, a2, a5 -; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: addi a5, zero, 95 -; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: lui a5, 11 +; RV32IM-NEXT: addi a5, a5, -905 +; RV32IM-NEXT: mul a5, a4, a5 +; RV32IM-NEXT: srli a5, a5, 22 +; RV32IM-NEXT: addi a2, zero, 95 +; RV32IM-NEXT: mul a2, a5, a2 ; RV32IM-NEXT: sub a2, a4, a2 ; RV32IM-NEXT: andi a1, a1, 63 ; RV32IM-NEXT: andi a3, a3, 31 @@ -716,21 +729,28 @@ ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) ; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: sd s4, 0(sp) ; RV64I-NEXT: lhu s2, 16(a1) ; RV64I-NEXT: lhu s3, 8(a1) -; RV64I-NEXT: lhu s1, 0(a1) -; RV64I-NEXT: lhu a2, 24(a1) -; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu s4, 0(a1) +; RV64I-NEXT: lhu s0, 24(a1) +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lui a0, 11 +; RV64I-NEXT: addiw a1, a0, -905 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 22 ; RV64I-NEXT: addi a1, zero, 95 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: andi a1, s1, 63 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s0, a0 +; RV64I-NEXT: andi a1, s4, 63 ; RV64I-NEXT: andi a2, s3, 31 ; RV64I-NEXT: andi a3, s2, 7 -; RV64I-NEXT: sh a0, 6(s0) -; RV64I-NEXT: sh a3, 4(s0) -; RV64I-NEXT: sh a2, 2(s0) -; RV64I-NEXT: sh a1, 0(s0) +; RV64I-NEXT: sh a3, 4(s1) +; RV64I-NEXT: sh a2, 2(s1) +; RV64I-NEXT: sh a1, 0(s1) +; RV64I-NEXT: sh a0, 6(s1) +; RV64I-NEXT: ld s4, 0(sp) ; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) @@ -743,31 +763,22 @@ ; RV64IM: # %bb.0: ; RV64IM-NEXT: lhu a6, 16(a1) ; RV64IM-NEXT: lhu a3, 8(a1) -; RV64IM-NEXT: lhu a4, 0(a1) -; RV64IM-NEXT: lhu a1, 
24(a1) -; RV64IM-NEXT: lui a5, 1423 -; RV64IM-NEXT: addiw a5, a5, -733 -; RV64IM-NEXT: slli a5, a5, 15 -; RV64IM-NEXT: addi a5, a5, 1035 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, -1811 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, 561 -; RV64IM-NEXT: mulhu a5, a1, a5 -; RV64IM-NEXT: sub a2, a1, a5 -; RV64IM-NEXT: srli a2, a2, 1 -; RV64IM-NEXT: add a2, a2, a5 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: addi a5, zero, 95 -; RV64IM-NEXT: mul a2, a2, a5 -; RV64IM-NEXT: sub a1, a1, a2 -; RV64IM-NEXT: andi a2, a4, 63 +; RV64IM-NEXT: lhu a4, 24(a1) +; RV64IM-NEXT: lhu a1, 0(a1) +; RV64IM-NEXT: lui a5, 11 +; RV64IM-NEXT: addiw a5, a5, -905 +; RV64IM-NEXT: mul a5, a4, a5 +; RV64IM-NEXT: srli a5, a5, 22 +; RV64IM-NEXT: addi a2, zero, 95 +; RV64IM-NEXT: mul a2, a5, a2 +; RV64IM-NEXT: sub a2, a4, a2 +; RV64IM-NEXT: andi a1, a1, 63 ; RV64IM-NEXT: andi a3, a3, 31 ; RV64IM-NEXT: andi a4, a6, 7 ; RV64IM-NEXT: sh a4, 4(a0) ; RV64IM-NEXT: sh a3, 2(a0) -; RV64IM-NEXT: sh a2, 0(a0) -; RV64IM-NEXT: sh a1, 6(a0) +; RV64IM-NEXT: sh a1, 0(a0) +; RV64IM-NEXT: sh a2, 6(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -783,26 +794,50 @@ ; RV32I-NEXT: sw s1, 20(sp) ; RV32I-NEXT: sw s2, 16(sp) ; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: sw s4, 8(sp) ; RV32I-NEXT: lhu s2, 12(a1) +; RV32I-NEXT: lhu s3, 4(a1) ; RV32I-NEXT: lhu s1, 8(a1) -; RV32I-NEXT: lhu a2, 4(a1) ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, zero, 654 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s3, a0 -; RV32I-NEXT: addi a1, zero, 23 +; RV32I-NEXT: lui a0, 6 +; RV32I-NEXT: addi a1, a0, 1069 ; RV32I-NEXT: mv a0, s1 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a1, a0, 1327 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: sub a1, s1, a0 +; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: addi a3, a2, -2 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: addi a1, a2, -16 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 4 +; RV32I-NEXT: addi a1, zero, 23 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s4, s1, a0 +; RV32I-NEXT: lui a0, 13 +; RV32I-NEXT: addi a1, a0, -1941 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 25 +; RV32I-NEXT: addi a1, zero, 654 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub s1, s3, a0 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a1, a0, 87 ; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 26 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: addi a1, a1, 1327 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: sub a0, s2, a0 ; RV32I-NEXT: sh zero, 0(s0) ; RV32I-NEXT: sh a0, 6(s0) -; RV32I-NEXT: sh s1, 4(s0) -; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: sh s1, 2(s0) +; RV32I-NEXT: sh s4, 4(s0) +; RV32I-NEXT: lw s4, 8(sp) ; RV32I-NEXT: lw s3, 12(sp) ; RV32I-NEXT: lw s2, 16(sp) ; RV32I-NEXT: lw s1, 20(sp) @@ -813,36 +848,42 @@ ; ; RV32IM-LABEL: dont_fold_urem_one: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lhu a2, 4(a1) -; RV32IM-NEXT: lhu a3, 12(a1) -; RV32IM-NEXT: lhu a1, 8(a1) -; RV32IM-NEXT: srli a4, a2, 1 -; RV32IM-NEXT: lui a5, 820904 -; RV32IM-NEXT: addi a5, a5, -1903 -; RV32IM-NEXT: mulhu a4, a4, a5 -; RV32IM-NEXT: srli a4, a4, 8 -; RV32IM-NEXT: addi a5, zero, 654 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a2, a2, a4 -; RV32IM-NEXT: lui a4, 729444 -; RV32IM-NEXT: addi a4, a4, 713 -; RV32IM-NEXT: mulhu a4, a1, a4 -; RV32IM-NEXT: srli a4, 
a4, 4 -; RV32IM-NEXT: addi a5, zero, 23 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a1, a1, a4 -; RV32IM-NEXT: lui a4, 395996 -; RV32IM-NEXT: addi a4, a4, -2009 -; RV32IM-NEXT: mulhu a4, a3, a4 -; RV32IM-NEXT: srli a4, a4, 11 -; RV32IM-NEXT: lui a5, 1 -; RV32IM-NEXT: addi a5, a5, 1327 -; RV32IM-NEXT: mul a4, a4, a5 -; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: lhu a6, 12(a1) +; RV32IM-NEXT: lhu a3, 8(a1) +; RV32IM-NEXT: lhu a1, 4(a1) +; RV32IM-NEXT: lui a4, 6 +; RV32IM-NEXT: addi a4, a4, 1069 +; RV32IM-NEXT: mul a4, a3, a4 +; RV32IM-NEXT: srli a4, a4, 16 +; RV32IM-NEXT: sub a5, a3, a4 +; RV32IM-NEXT: lui a2, 16 +; RV32IM-NEXT: addi a2, a2, -2 +; RV32IM-NEXT: and a2, a5, a2 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: srli a2, a2, 4 +; RV32IM-NEXT: addi a4, zero, 23 +; RV32IM-NEXT: mul a2, a2, a4 +; RV32IM-NEXT: sub a2, a3, a2 +; RV32IM-NEXT: lui a3, 13 +; RV32IM-NEXT: addi a3, a3, -1941 +; RV32IM-NEXT: mul a3, a1, a3 +; RV32IM-NEXT: srli a3, a3, 25 +; RV32IM-NEXT: addi a4, zero, 654 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: sub a1, a1, a3 +; RV32IM-NEXT: lui a3, 3 +; RV32IM-NEXT: addi a3, a3, 87 +; RV32IM-NEXT: mul a3, a6, a3 +; RV32IM-NEXT: srli a3, a3, 26 +; RV32IM-NEXT: lui a4, 1 +; RV32IM-NEXT: addi a4, a4, 1327 +; RV32IM-NEXT: mul a3, a3, a4 +; RV32IM-NEXT: sub a3, a6, a3 ; RV32IM-NEXT: sh zero, 0(a0) ; RV32IM-NEXT: sh a3, 6(a0) -; RV32IM-NEXT: sh a1, 4(a0) -; RV32IM-NEXT: sh a2, 2(a0) +; RV32IM-NEXT: sh a1, 2(a0) +; RV32IM-NEXT: sh a2, 4(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_urem_one: @@ -853,26 +894,50 @@ ; RV64I-NEXT: sd s1, 24(sp) ; RV64I-NEXT: sd s2, 16(sp) ; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: sd s4, 0(sp) ; RV64I-NEXT: lhu s2, 24(a1) +; RV64I-NEXT: lhu s3, 8(a1) ; RV64I-NEXT: lhu s1, 16(a1) -; RV64I-NEXT: lhu a2, 8(a1) ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, zero, 654 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s3, a0 -; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: lui a0, 6 +; RV64I-NEXT: addiw a1, a0, 1069 ; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a1, a0, 1327 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: sub a1, s1, a0 +; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: addiw a3, a2, -2 +; RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: srli a1, a1, 1 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: addiw a1, a2, -16 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: srli a0, a0, 4 +; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s4, s1, a0 +; RV64I-NEXT: lui a0, 13 +; RV64I-NEXT: addiw a1, a0, -1941 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 25 +; RV64I-NEXT: addi a1, zero, 654 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s1, s3, a0 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a1, a0, 87 ; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 26 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: addiw a1, a1, 1327 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s2, a0 ; RV64I-NEXT: sh zero, 0(s0) ; RV64I-NEXT: sh a0, 6(s0) -; RV64I-NEXT: sh s1, 4(s0) -; RV64I-NEXT: sh s3, 2(s0) +; RV64I-NEXT: sh s1, 2(s0) +; RV64I-NEXT: sh s4, 4(s0) +; RV64I-NEXT: ld s4, 0(sp) ; RV64I-NEXT: ld s3, 8(sp) ; RV64I-NEXT: ld s2, 16(sp) ; RV64I-NEXT: ld s1, 24(sp) @@ -883,57 +948,42 @@ ; ; RV64IM-LABEL: dont_fold_urem_one: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lhu a2, 24(a1) -; RV64IM-NEXT: lhu a3, 
8(a1) -; RV64IM-NEXT: lhu a1, 16(a1) -; RV64IM-NEXT: lui a4, 3206 -; RV64IM-NEXT: addiw a4, a4, -1781 -; RV64IM-NEXT: slli a4, a4, 13 -; RV64IM-NEXT: addi a4, a4, 1069 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, -1959 -; RV64IM-NEXT: slli a4, a4, 14 -; RV64IM-NEXT: addi a4, a4, 713 -; RV64IM-NEXT: mulhu a4, a1, a4 -; RV64IM-NEXT: sub a5, a1, a4 -; RV64IM-NEXT: srli a5, a5, 1 -; RV64IM-NEXT: add a4, a5, a4 -; RV64IM-NEXT: srli a4, a4, 4 -; RV64IM-NEXT: addi a5, zero, 23 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a1, a1, a4 -; RV64IM-NEXT: srli a4, a3, 1 -; RV64IM-NEXT: lui a5, 6413 -; RV64IM-NEXT: addiw a5, a5, 1265 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, 1027 -; RV64IM-NEXT: slli a5, a5, 13 -; RV64IM-NEXT: addi a5, a5, 1077 -; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, 965 -; RV64IM-NEXT: mulhu a4, a4, a5 -; RV64IM-NEXT: srli a4, a4, 7 -; RV64IM-NEXT: addi a5, zero, 654 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a3, a3, a4 -; RV64IM-NEXT: lui a4, 1044567 -; RV64IM-NEXT: addiw a4, a4, -575 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, 883 -; RV64IM-NEXT: slli a4, a4, 14 -; RV64IM-NEXT: addi a4, a4, -861 -; RV64IM-NEXT: slli a4, a4, 12 -; RV64IM-NEXT: addi a4, a4, -179 -; RV64IM-NEXT: mulhu a4, a2, a4 -; RV64IM-NEXT: srli a4, a4, 12 -; RV64IM-NEXT: lui a5, 1 -; RV64IM-NEXT: addiw a5, a5, 1327 -; RV64IM-NEXT: mul a4, a4, a5 -; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: lhu a6, 24(a1) +; RV64IM-NEXT: lhu a3, 16(a1) +; RV64IM-NEXT: lhu a1, 8(a1) +; RV64IM-NEXT: lui a4, 6 +; RV64IM-NEXT: addiw a4, a4, 1069 +; RV64IM-NEXT: mul a4, a3, a4 +; RV64IM-NEXT: srli a4, a4, 16 +; RV64IM-NEXT: sub a5, a3, a4 +; RV64IM-NEXT: lui a2, 16 +; RV64IM-NEXT: addiw a2, a2, -2 +; RV64IM-NEXT: and a2, a5, a2 +; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: add a2, a2, a4 +; RV64IM-NEXT: srli a2, a2, 4 +; RV64IM-NEXT: addi a4, zero, 23 +; RV64IM-NEXT: mul a2, a2, a4 +; RV64IM-NEXT: sub a2, a3, a2 +; RV64IM-NEXT: lui a3, 13 +; RV64IM-NEXT: addiw a3, a3, -1941 +; RV64IM-NEXT: mul a3, a1, a3 +; RV64IM-NEXT: srli a3, a3, 25 +; RV64IM-NEXT: addi a4, zero, 654 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: sub a1, a1, a3 +; RV64IM-NEXT: lui a3, 3 +; RV64IM-NEXT: addiw a3, a3, 87 +; RV64IM-NEXT: mul a3, a6, a3 +; RV64IM-NEXT: srli a3, a3, 26 +; RV64IM-NEXT: lui a4, 1 +; RV64IM-NEXT: addiw a4, a4, 1327 +; RV64IM-NEXT: mul a3, a3, a4 +; RV64IM-NEXT: sub a3, a6, a3 ; RV64IM-NEXT: sh zero, 0(a0) -; RV64IM-NEXT: sh a2, 6(a0) -; RV64IM-NEXT: sh a3, 2(a0) -; RV64IM-NEXT: sh a1, 4(a0) +; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: sh a1, 2(a0) +; RV64IM-NEXT: sh a2, 4(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -952,180 +1002,545 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32I-LABEL: dont_fold_urem_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -48 -; RV32I-NEXT: sw ra, 44(sp) -; RV32I-NEXT: sw s0, 40(sp) -; RV32I-NEXT: sw s1, 36(sp) -; RV32I-NEXT: sw s2, 32(sp) -; RV32I-NEXT: sw s3, 28(sp) -; RV32I-NEXT: sw s4, 24(sp) -; RV32I-NEXT: sw s5, 20(sp) -; RV32I-NEXT: sw s6, 16(sp) -; RV32I-NEXT: sw s7, 12(sp) -; RV32I-NEXT: sw s8, 8(sp) -; RV32I-NEXT: sw s9, 4(sp) -; RV32I-NEXT: lw s2, 24(a1) -; RV32I-NEXT: lw s3, 28(a1) -; RV32I-NEXT: lw s4, 16(a1) -; RV32I-NEXT: lw s5, 20(a1) -; RV32I-NEXT: lw s6, 8(a1) -; RV32I-NEXT: lw s1, 12(a1) -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a2, zero, 1 -; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: mv a3, zero -; 
RV32I-NEXT: call __umoddi3 -; RV32I-NEXT: mv s7, a0 -; RV32I-NEXT: mv s8, a1 +; RV32I-NEXT: addi sp, sp, -464 +; RV32I-NEXT: sw ra, 460(sp) +; RV32I-NEXT: sw s0, 456(sp) +; RV32I-NEXT: sw s1, 452(sp) +; RV32I-NEXT: sw s2, 448(sp) +; RV32I-NEXT: sw s3, 444(sp) +; RV32I-NEXT: sw s4, 440(sp) +; RV32I-NEXT: sw s5, 436(sp) +; RV32I-NEXT: sw s6, 432(sp) +; RV32I-NEXT: sw s7, 428(sp) +; RV32I-NEXT: sw s8, 424(sp) +; RV32I-NEXT: sw s9, 420(sp) +; RV32I-NEXT: sw s10, 416(sp) +; RV32I-NEXT: sw s11, 412(sp) +; RV32I-NEXT: lw a2, 0(a1) +; RV32I-NEXT: sw a2, 20(sp) +; RV32I-NEXT: lw a2, 4(a1) +; RV32I-NEXT: sw a2, 16(sp) +; RV32I-NEXT: lw s6, 24(a1) +; RV32I-NEXT: lw s0, 28(a1) +; RV32I-NEXT: lw s7, 16(a1) +; RV32I-NEXT: lw s1, 20(a1) +; RV32I-NEXT: lw s8, 8(a1) +; RV32I-NEXT: lw s10, 12(a1) +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: sw zero, 180(sp) +; RV32I-NEXT: sw zero, 176(sp) +; RV32I-NEXT: sw zero, 196(sp) +; RV32I-NEXT: sw zero, 192(sp) +; RV32I-NEXT: lui a0, 410452 +; RV32I-NEXT: addi a0, a0, -952 +; RV32I-NEXT: sw a0, 172(sp) +; RV32I-NEXT: lui a0, 25653 +; RV32I-NEXT: addi a0, a0, 965 +; RV32I-NEXT: sw a0, 168(sp) +; RV32I-NEXT: srli a0, s10, 1 +; RV32I-NEXT: sw a0, 188(sp) +; RV32I-NEXT: slli a0, s10, 31 +; RV32I-NEXT: srli a1, s8, 1 +; RV32I-NEXT: or a3, a1, a0 +; RV32I-NEXT: addi a0, sp, 200 +; RV32I-NEXT: addi a1, sp, 184 +; RV32I-NEXT: addi a2, sp, 168 +; RV32I-NEXT: sw a3, 184(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: sw zero, 276(sp) +; RV32I-NEXT: sw zero, 272(sp) +; RV32I-NEXT: sw zero, 292(sp) +; RV32I-NEXT: sw zero, 288(sp) +; RV32I-NEXT: lui a0, 410312 +; RV32I-NEXT: addi a0, a0, 1424 +; RV32I-NEXT: sw a0, 268(sp) +; RV32I-NEXT: lui a0, 729444 +; RV32I-NEXT: addi a0, a0, 713 +; RV32I-NEXT: sw a0, 264(sp) +; RV32I-NEXT: sw s1, 284(sp) +; RV32I-NEXT: mv s3, s1 +; RV32I-NEXT: sw s1, 8(sp) +; RV32I-NEXT: addi a0, sp, 296 +; RV32I-NEXT: addi a1, sp, 280 +; RV32I-NEXT: addi a2, sp, 264 +; RV32I-NEXT: sw s7, 280(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: sw zero, 372(sp) +; RV32I-NEXT: sw zero, 368(sp) +; RV32I-NEXT: sw zero, 388(sp) +; RV32I-NEXT: sw zero, 384(sp) +; RV32I-NEXT: lui a0, 791991 +; RV32I-NEXT: addi a0, a0, 77 +; RV32I-NEXT: sw a0, 364(sp) +; RV32I-NEXT: lui a0, 834723 +; RV32I-NEXT: addi a0, a0, -179 +; RV32I-NEXT: sw a0, 360(sp) +; RV32I-NEXT: sw s0, 380(sp) +; RV32I-NEXT: mv s4, s0 +; RV32I-NEXT: sw s0, 12(sp) +; RV32I-NEXT: addi a0, sp, 392 +; RV32I-NEXT: addi a1, sp, 376 +; RV32I-NEXT: addi a2, sp, 360 +; RV32I-NEXT: sw s6, 376(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: sw zero, 132(sp) +; RV32I-NEXT: sw zero, 128(sp) +; RV32I-NEXT: sw zero, 124(sp) +; RV32I-NEXT: sw zero, 120(sp) +; RV32I-NEXT: sw zero, 148(sp) +; RV32I-NEXT: sw zero, 144(sp) +; RV32I-NEXT: lw s0, 208(sp) +; RV32I-NEXT: sub a0, s8, s0 +; RV32I-NEXT: sw a0, 136(sp) +; RV32I-NEXT: lw s1, 212(sp) +; RV32I-NEXT: sltu a0, s8, s0 +; RV32I-NEXT: sub a1, s10, s1 +; RV32I-NEXT: sub a3, a1, a0 +; RV32I-NEXT: addi a0, sp, 152 +; RV32I-NEXT: addi a1, sp, 136 +; RV32I-NEXT: addi a2, sp, 120 +; RV32I-NEXT: sw a3, 140(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: sw zero, 228(sp) +; RV32I-NEXT: sw zero, 224(sp) +; RV32I-NEXT: lui a0, 524288 +; RV32I-NEXT: sw a0, 220(sp) +; RV32I-NEXT: sw zero, 216(sp) +; RV32I-NEXT: sw zero, 244(sp) +; RV32I-NEXT: sw zero, 240(sp) +; RV32I-NEXT: lw s2, 304(sp) +; RV32I-NEXT: sub a0, s7, s2 +; RV32I-NEXT: sw a0, 232(sp) +; RV32I-NEXT: lw s9, 308(sp) +; RV32I-NEXT: sltu a0, s7, s2 +; RV32I-NEXT: sub a1, s3, s9 +; RV32I-NEXT: sub a3, a1, a0 +; RV32I-NEXT: addi a0, sp, 
248 +; RV32I-NEXT: addi a1, sp, 232 +; RV32I-NEXT: addi a2, sp, 216 +; RV32I-NEXT: sw a3, 236(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: sw zero, 324(sp) +; RV32I-NEXT: sw zero, 320(sp) +; RV32I-NEXT: sw zero, 316(sp) +; RV32I-NEXT: sw zero, 312(sp) +; RV32I-NEXT: sw zero, 340(sp) +; RV32I-NEXT: sw zero, 336(sp) +; RV32I-NEXT: lw s3, 400(sp) +; RV32I-NEXT: sub a0, s6, s3 +; RV32I-NEXT: sw a0, 328(sp) +; RV32I-NEXT: lw s11, 404(sp) +; RV32I-NEXT: sltu a0, s6, s3 +; RV32I-NEXT: sub a1, s4, s11 +; RV32I-NEXT: sub a3, a1, a0 +; RV32I-NEXT: addi a0, sp, 344 +; RV32I-NEXT: addi a1, sp, 328 +; RV32I-NEXT: addi a2, sp, 312 +; RV32I-NEXT: sw a3, 332(sp) +; RV32I-NEXT: call __multi3 +; RV32I-NEXT: lw a0, 164(sp) +; RV32I-NEXT: lw a1, 160(sp) +; RV32I-NEXT: add a0, a0, s1 +; RV32I-NEXT: add a2, a1, s0 +; RV32I-NEXT: sltu a1, a2, a1 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: slli a0, a1, 25 +; RV32I-NEXT: srli a2, a2, 7 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: srli a1, a1, 7 ; RV32I-NEXT: addi a2, zero, 654 -; RV32I-NEXT: mv a0, s6 -; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __umoddi3 -; RV32I-NEXT: mv s6, a0 -; RV32I-NEXT: mv s9, a1 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: sub a0, s10, a1 +; RV32I-NEXT: lw a1, 260(sp) +; RV32I-NEXT: lw a2, 256(sp) +; RV32I-NEXT: sltu a3, s8, s4 +; RV32I-NEXT: sub s10, a0, a3 +; RV32I-NEXT: add a0, a1, s9 +; RV32I-NEXT: add a1, a2, s2 +; RV32I-NEXT: sltu a2, a1, a2 +; RV32I-NEXT: add a2, a0, a2 +; RV32I-NEXT: slli a0, a2, 28 +; RV32I-NEXT: srli a1, a1, 4 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: srli a1, a2, 4 ; RV32I-NEXT: addi a2, zero, 23 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s5 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __umoddi3 -; RV32I-NEXT: mv s4, a0 -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a2, a0, 1327 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: mv s9, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: lw a1, 356(sp) +; RV32I-NEXT: lw a2, 352(sp) +; RV32I-NEXT: sltu a3, s7, s9 +; RV32I-NEXT: sub s2, a0, a3 +; RV32I-NEXT: add a0, a1, s11 +; RV32I-NEXT: add a1, a2, s3 +; RV32I-NEXT: sltu a2, a1, a2 +; RV32I-NEXT: add a2, a0, a2 +; RV32I-NEXT: slli a0, a2, 20 +; RV32I-NEXT: srli a1, a1, 12 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: srli a1, a2, 12 +; RV32I-NEXT: lui a2, 1 +; RV32I-NEXT: addi a2, a2, 1327 ; RV32I-NEXT: mv a3, zero -; RV32I-NEXT: call __umoddi3 -; RV32I-NEXT: sw a1, 28(s0) -; RV32I-NEXT: sw a0, 24(s0) -; RV32I-NEXT: sw s1, 20(s0) -; RV32I-NEXT: sw s4, 16(s0) -; RV32I-NEXT: sw s9, 12(s0) -; RV32I-NEXT: sw s6, 8(s0) -; RV32I-NEXT: sw s8, 4(s0) -; RV32I-NEXT: sw s7, 0(s0) -; RV32I-NEXT: lw s9, 4(sp) -; RV32I-NEXT: lw s8, 8(sp) -; RV32I-NEXT: lw s7, 12(sp) -; RV32I-NEXT: lw s6, 16(sp) -; RV32I-NEXT: lw s5, 20(sp) -; RV32I-NEXT: lw s4, 24(sp) -; RV32I-NEXT: lw s3, 28(sp) -; RV32I-NEXT: lw s2, 32(sp) -; RV32I-NEXT: lw s1, 36(sp) -; RV32I-NEXT: lw s0, 40(sp) -; RV32I-NEXT: lw ra, 44(sp) -; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: mv s11, a0 +; RV32I-NEXT: sltu a0, s6, a0 +; RV32I-NEXT: lw a2, 12(sp) +; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: sub s3, a1, a0 +; RV32I-NEXT: addi a2, zero, 1 +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: lw s0, 16(sp) +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: sltu a2, s1, a0 +; RV32I-NEXT: sub a1, s0, a1 +; RV32I-NEXT: sub a1, a1, a2 +; 
RV32I-NEXT: sub a2, s8, s4 +; RV32I-NEXT: sub a3, s7, s9 +; RV32I-NEXT: sub a4, s6, s11 +; RV32I-NEXT: sub a0, s1, a0 +; RV32I-NEXT: sw a0, 0(s5) +; RV32I-NEXT: sw a1, 4(s5) +; RV32I-NEXT: sw a4, 24(s5) +; RV32I-NEXT: sw a3, 16(s5) +; RV32I-NEXT: sw s3, 28(s5) +; RV32I-NEXT: sw s2, 20(s5) +; RV32I-NEXT: sw a2, 8(s5) +; RV32I-NEXT: sw s10, 12(s5) +; RV32I-NEXT: lw s11, 412(sp) +; RV32I-NEXT: lw s10, 416(sp) +; RV32I-NEXT: lw s9, 420(sp) +; RV32I-NEXT: lw s8, 424(sp) +; RV32I-NEXT: lw s7, 428(sp) +; RV32I-NEXT: lw s6, 432(sp) +; RV32I-NEXT: lw s5, 436(sp) +; RV32I-NEXT: lw s4, 440(sp) +; RV32I-NEXT: lw s3, 444(sp) +; RV32I-NEXT: lw s2, 448(sp) +; RV32I-NEXT: lw s1, 452(sp) +; RV32I-NEXT: lw s0, 456(sp) +; RV32I-NEXT: lw ra, 460(sp) +; RV32I-NEXT: addi sp, sp, 464 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: dont_fold_urem_i64: ; RV32IM: # %bb.0: -; RV32IM-NEXT: addi sp, sp, -48 -; RV32IM-NEXT: sw ra, 44(sp) -; RV32IM-NEXT: sw s0, 40(sp) -; RV32IM-NEXT: sw s1, 36(sp) -; RV32IM-NEXT: sw s2, 32(sp) -; RV32IM-NEXT: sw s3, 28(sp) -; RV32IM-NEXT: sw s4, 24(sp) -; RV32IM-NEXT: sw s5, 20(sp) -; RV32IM-NEXT: sw s6, 16(sp) -; RV32IM-NEXT: sw s7, 12(sp) -; RV32IM-NEXT: sw s8, 8(sp) -; RV32IM-NEXT: sw s9, 4(sp) +; RV32IM-NEXT: addi sp, sp, -448 +; RV32IM-NEXT: sw ra, 444(sp) +; RV32IM-NEXT: sw s0, 440(sp) +; RV32IM-NEXT: sw s1, 436(sp) +; RV32IM-NEXT: sw s2, 432(sp) +; RV32IM-NEXT: sw s3, 428(sp) +; RV32IM-NEXT: sw s4, 424(sp) +; RV32IM-NEXT: sw s5, 420(sp) +; RV32IM-NEXT: sw s6, 416(sp) +; RV32IM-NEXT: sw s7, 412(sp) +; RV32IM-NEXT: sw s8, 408(sp) +; RV32IM-NEXT: sw s9, 404(sp) +; RV32IM-NEXT: sw s10, 400(sp) +; RV32IM-NEXT: sw s11, 396(sp) ; RV32IM-NEXT: lw s2, 24(a1) -; RV32IM-NEXT: lw s3, 28(a1) -; RV32IM-NEXT: lw s4, 16(a1) -; RV32IM-NEXT: lw s5, 20(a1) +; RV32IM-NEXT: lw s5, 28(a1) +; RV32IM-NEXT: lw s3, 16(a1) +; RV32IM-NEXT: lw s1, 20(a1) ; RV32IM-NEXT: lw s6, 8(a1) -; RV32IM-NEXT: lw s1, 12(a1) -; RV32IM-NEXT: lw a3, 0(a1) -; RV32IM-NEXT: lw a1, 4(a1) -; RV32IM-NEXT: mv s0, a0 -; RV32IM-NEXT: addi a2, zero, 1 -; RV32IM-NEXT: mv a0, a3 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: mv s7, a0 -; RV32IM-NEXT: mv s8, a1 -; RV32IM-NEXT: addi a2, zero, 654 -; RV32IM-NEXT: mv a0, s6 -; RV32IM-NEXT: mv a1, s1 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: mv s6, a0 -; RV32IM-NEXT: mv s9, a1 -; RV32IM-NEXT: addi a2, zero, 23 -; RV32IM-NEXT: mv a0, s4 -; RV32IM-NEXT: mv a1, s5 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: mv s4, a0 -; RV32IM-NEXT: mv s1, a1 -; RV32IM-NEXT: lui a0, 1 -; RV32IM-NEXT: addi a2, a0, 1327 -; RV32IM-NEXT: mv a0, s2 -; RV32IM-NEXT: mv a1, s3 -; RV32IM-NEXT: mv a3, zero -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: sw a1, 28(s0) -; RV32IM-NEXT: sw a0, 24(s0) -; RV32IM-NEXT: sw s1, 20(s0) -; RV32IM-NEXT: sw s4, 16(s0) -; RV32IM-NEXT: sw s9, 12(s0) -; RV32IM-NEXT: sw s6, 8(s0) -; RV32IM-NEXT: sw s8, 4(s0) -; RV32IM-NEXT: sw s7, 0(s0) -; RV32IM-NEXT: lw s9, 4(sp) -; RV32IM-NEXT: lw s8, 8(sp) -; RV32IM-NEXT: lw s7, 12(sp) -; RV32IM-NEXT: lw s6, 16(sp) -; RV32IM-NEXT: lw s5, 20(sp) -; RV32IM-NEXT: lw s4, 24(sp) -; RV32IM-NEXT: lw s3, 28(sp) -; RV32IM-NEXT: lw s2, 32(sp) -; RV32IM-NEXT: lw s1, 36(sp) -; RV32IM-NEXT: lw s0, 40(sp) -; RV32IM-NEXT: lw ra, 44(sp) -; RV32IM-NEXT: addi sp, sp, 48 +; RV32IM-NEXT: lw s7, 12(a1) +; RV32IM-NEXT: mv s9, a0 +; RV32IM-NEXT: sw zero, 164(sp) +; RV32IM-NEXT: sw zero, 160(sp) +; RV32IM-NEXT: sw zero, 180(sp) +; RV32IM-NEXT: sw zero, 176(sp) +; RV32IM-NEXT: lui a0, 410452 +; 
RV32IM-NEXT: addi a0, a0, -952 +; RV32IM-NEXT: sw a0, 156(sp) +; RV32IM-NEXT: lui a0, 25653 +; RV32IM-NEXT: addi a0, a0, 965 +; RV32IM-NEXT: sw a0, 152(sp) +; RV32IM-NEXT: srli a0, s7, 1 +; RV32IM-NEXT: sw a0, 172(sp) +; RV32IM-NEXT: slli a0, s7, 31 +; RV32IM-NEXT: srli a1, s6, 1 +; RV32IM-NEXT: or a3, a1, a0 +; RV32IM-NEXT: addi a0, sp, 184 +; RV32IM-NEXT: addi a1, sp, 168 +; RV32IM-NEXT: addi a2, sp, 152 +; RV32IM-NEXT: sw a3, 168(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 260(sp) +; RV32IM-NEXT: sw zero, 256(sp) +; RV32IM-NEXT: sw zero, 276(sp) +; RV32IM-NEXT: sw zero, 272(sp) +; RV32IM-NEXT: lui a0, 410312 +; RV32IM-NEXT: addi a0, a0, 1424 +; RV32IM-NEXT: sw a0, 252(sp) +; RV32IM-NEXT: lui a0, 729444 +; RV32IM-NEXT: addi a0, a0, 713 +; RV32IM-NEXT: sw a0, 248(sp) +; RV32IM-NEXT: sw s1, 268(sp) +; RV32IM-NEXT: sw s1, 4(sp) +; RV32IM-NEXT: addi a0, sp, 280 +; RV32IM-NEXT: addi a1, sp, 264 +; RV32IM-NEXT: addi a2, sp, 248 +; RV32IM-NEXT: sw s3, 264(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 356(sp) +; RV32IM-NEXT: sw zero, 352(sp) +; RV32IM-NEXT: sw zero, 372(sp) +; RV32IM-NEXT: sw zero, 368(sp) +; RV32IM-NEXT: lui a0, 791991 +; RV32IM-NEXT: addi a0, a0, 77 +; RV32IM-NEXT: sw a0, 348(sp) +; RV32IM-NEXT: lui a0, 834723 +; RV32IM-NEXT: addi a0, a0, -179 +; RV32IM-NEXT: sw a0, 344(sp) +; RV32IM-NEXT: sw s5, 364(sp) +; RV32IM-NEXT: addi a0, sp, 376 +; RV32IM-NEXT: addi a1, sp, 360 +; RV32IM-NEXT: addi a2, sp, 344 +; RV32IM-NEXT: sw s2, 360(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 116(sp) +; RV32IM-NEXT: sw zero, 112(sp) +; RV32IM-NEXT: sw zero, 108(sp) +; RV32IM-NEXT: sw zero, 104(sp) +; RV32IM-NEXT: sw zero, 132(sp) +; RV32IM-NEXT: sw zero, 128(sp) +; RV32IM-NEXT: lw s11, 192(sp) +; RV32IM-NEXT: sub a0, s6, s11 +; RV32IM-NEXT: sw a0, 120(sp) +; RV32IM-NEXT: lw s8, 196(sp) +; RV32IM-NEXT: sltu a0, s6, s11 +; RV32IM-NEXT: sub a1, s7, s8 +; RV32IM-NEXT: sub a3, a1, a0 +; RV32IM-NEXT: addi a0, sp, 136 +; RV32IM-NEXT: addi a1, sp, 120 +; RV32IM-NEXT: addi a2, sp, 104 +; RV32IM-NEXT: sw a3, 124(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 212(sp) +; RV32IM-NEXT: sw zero, 208(sp) +; RV32IM-NEXT: lui a0, 524288 +; RV32IM-NEXT: sw a0, 204(sp) +; RV32IM-NEXT: sw zero, 200(sp) +; RV32IM-NEXT: sw zero, 228(sp) +; RV32IM-NEXT: sw zero, 224(sp) +; RV32IM-NEXT: lw s0, 288(sp) +; RV32IM-NEXT: sub a0, s3, s0 +; RV32IM-NEXT: sw a0, 216(sp) +; RV32IM-NEXT: lw s10, 292(sp) +; RV32IM-NEXT: sltu a0, s3, s0 +; RV32IM-NEXT: sub a1, s1, s10 +; RV32IM-NEXT: sub a3, a1, a0 +; RV32IM-NEXT: addi a0, sp, 232 +; RV32IM-NEXT: addi a1, sp, 216 +; RV32IM-NEXT: addi a2, sp, 200 +; RV32IM-NEXT: sw a3, 220(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: sw zero, 308(sp) +; RV32IM-NEXT: sw zero, 304(sp) +; RV32IM-NEXT: sw zero, 300(sp) +; RV32IM-NEXT: sw zero, 296(sp) +; RV32IM-NEXT: sw zero, 324(sp) +; RV32IM-NEXT: sw zero, 320(sp) +; RV32IM-NEXT: lw s1, 384(sp) +; RV32IM-NEXT: sub a0, s2, s1 +; RV32IM-NEXT: sw a0, 312(sp) +; RV32IM-NEXT: lw s4, 388(sp) +; RV32IM-NEXT: sltu a0, s2, s1 +; RV32IM-NEXT: sub a1, s5, s4 +; RV32IM-NEXT: sub a3, a1, a0 +; RV32IM-NEXT: addi a0, sp, 328 +; RV32IM-NEXT: addi a1, sp, 312 +; RV32IM-NEXT: addi a2, sp, 296 +; RV32IM-NEXT: sw a3, 316(sp) +; RV32IM-NEXT: call __multi3 +; RV32IM-NEXT: lw a0, 148(sp) +; RV32IM-NEXT: lw a1, 144(sp) +; RV32IM-NEXT: add a0, a0, s8 +; RV32IM-NEXT: add a2, a1, s11 +; RV32IM-NEXT: sltu a1, a2, a1 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: srli a1, a0, 7 +; RV32IM-NEXT: addi a3, zero, 654 +; 
RV32IM-NEXT: mul a1, a1, a3 +; RV32IM-NEXT: slli a0, a0, 25 +; RV32IM-NEXT: srli a2, a2, 7 +; RV32IM-NEXT: or a0, a2, a0 +; RV32IM-NEXT: mulhu a2, a0, a3 +; RV32IM-NEXT: add a1, a2, a1 +; RV32IM-NEXT: sub a1, s7, a1 +; RV32IM-NEXT: mul a0, a0, a3 +; RV32IM-NEXT: lw a2, 244(sp) +; RV32IM-NEXT: lw a3, 240(sp) +; RV32IM-NEXT: sltu a4, s6, a0 +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: add a2, a2, s10 +; RV32IM-NEXT: add a4, a3, s0 +; RV32IM-NEXT: sltu a3, a4, a3 +; RV32IM-NEXT: add a2, a2, a3 +; RV32IM-NEXT: srli a3, a2, 4 +; RV32IM-NEXT: addi a5, zero, 23 +; RV32IM-NEXT: mul a3, a3, a5 +; RV32IM-NEXT: slli a2, a2, 28 +; RV32IM-NEXT: srli a4, a4, 4 +; RV32IM-NEXT: or a2, a4, a2 +; RV32IM-NEXT: mulhu a4, a2, a5 +; RV32IM-NEXT: add a3, a4, a3 +; RV32IM-NEXT: lw a4, 4(sp) +; RV32IM-NEXT: sub a3, a4, a3 +; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: lw a4, 340(sp) +; RV32IM-NEXT: lw a5, 336(sp) +; RV32IM-NEXT: sltu s0, s3, a2 +; RV32IM-NEXT: sub a3, a3, s0 +; RV32IM-NEXT: add a4, a4, s4 +; RV32IM-NEXT: add s1, a5, s1 +; RV32IM-NEXT: sltu a5, s1, a5 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: srli a5, a4, 12 +; RV32IM-NEXT: lui s0, 1 +; RV32IM-NEXT: addi s0, s0, 1327 +; RV32IM-NEXT: mul a5, a5, s0 +; RV32IM-NEXT: slli a4, a4, 20 +; RV32IM-NEXT: srli s1, s1, 12 +; RV32IM-NEXT: or a4, s1, a4 +; RV32IM-NEXT: mulhu s1, a4, s0 +; RV32IM-NEXT: add a5, s1, a5 +; RV32IM-NEXT: sub a5, s5, a5 +; RV32IM-NEXT: mul a4, a4, s0 +; RV32IM-NEXT: sltu s1, s2, a4 +; RV32IM-NEXT: sub a5, a5, s1 +; RV32IM-NEXT: sub a0, s6, a0 +; RV32IM-NEXT: sub a2, s3, a2 +; RV32IM-NEXT: sub a4, s2, a4 +; RV32IM-NEXT: sw zero, 4(s9) +; RV32IM-NEXT: sw zero, 0(s9) +; RV32IM-NEXT: sw a4, 24(s9) +; RV32IM-NEXT: sw a2, 16(s9) +; RV32IM-NEXT: sw a5, 28(s9) +; RV32IM-NEXT: sw a3, 20(s9) +; RV32IM-NEXT: sw a0, 8(s9) +; RV32IM-NEXT: sw a1, 12(s9) +; RV32IM-NEXT: lw s11, 396(sp) +; RV32IM-NEXT: lw s10, 400(sp) +; RV32IM-NEXT: lw s9, 404(sp) +; RV32IM-NEXT: lw s8, 408(sp) +; RV32IM-NEXT: lw s7, 412(sp) +; RV32IM-NEXT: lw s6, 416(sp) +; RV32IM-NEXT: lw s5, 420(sp) +; RV32IM-NEXT: lw s4, 424(sp) +; RV32IM-NEXT: lw s3, 428(sp) +; RV32IM-NEXT: lw s2, 432(sp) +; RV32IM-NEXT: lw s1, 436(sp) +; RV32IM-NEXT: lw s0, 440(sp) +; RV32IM-NEXT: lw ra, 444(sp) +; RV32IM-NEXT: addi sp, sp, 448 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_urem_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: sd ra, 40(sp) -; RV64I-NEXT: sd s0, 32(sp) -; RV64I-NEXT: sd s1, 24(sp) -; RV64I-NEXT: sd s2, 16(sp) -; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: sd ra, 56(sp) +; RV64I-NEXT: sd s0, 48(sp) +; RV64I-NEXT: sd s1, 40(sp) +; RV64I-NEXT: sd s2, 32(sp) +; RV64I-NEXT: sd s3, 24(sp) +; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: sd s5, 8(sp) ; RV64I-NEXT: ld s2, 24(a1) -; RV64I-NEXT: ld s1, 16(a1) -; RV64I-NEXT: ld a2, 8(a1) -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, zero, 654 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: ld s5, 16(a1) +; RV64I-NEXT: ld s1, 8(a1) ; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: srli a0, s1, 1 +; RV64I-NEXT: lui a1, 6413 +; RV64I-NEXT: addiw a1, a1, 1265 +; RV64I-NEXT: slli a1, a1, 13 +; RV64I-NEXT: addi a1, a1, 1027 +; RV64I-NEXT: slli a1, a1, 13 +; RV64I-NEXT: addi a1, a1, 1077 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a2, a1, 965 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: sub a0, s1, a1 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a2, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: 
call __multi3 +; RV64I-NEXT: add a0, a1, s0 +; RV64I-NEXT: srli a0, a0, 7 +; RV64I-NEXT: addi a1, zero, 654 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s4, s1, a0 +; RV64I-NEXT: lui a0, 3206 +; RV64I-NEXT: addiw a0, a0, -1781 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: addi a0, a0, 1069 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: addi a0, a0, -1959 +; RV64I-NEXT: slli a0, a0, 14 +; RV64I-NEXT: addi a2, a0, 713 +; RV64I-NEXT: mv a0, s5 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: sub a0, s5, a1 +; RV64I-NEXT: addi a1, zero, -1 +; RV64I-NEXT: slli a2, a1, 63 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: add a0, a1, s0 +; RV64I-NEXT: srli a0, a0, 4 ; RV64I-NEXT: addi a1, zero, 23 -; RV64I-NEXT: mv a0, s1 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a1, a0, 1327 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub s1, s5, a0 +; RV64I-NEXT: lui a0, 1044567 +; RV64I-NEXT: addiw a0, a0, -575 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: addi a0, a0, 883 +; RV64I-NEXT: slli a0, a0, 14 +; RV64I-NEXT: addi a0, a0, -861 +; RV64I-NEXT: slli a0, a0, 12 +; RV64I-NEXT: addi a2, a0, -179 ; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: sd zero, 0(s0) -; RV64I-NEXT: sd a0, 24(s0) -; RV64I-NEXT: sd s1, 16(s0) -; RV64I-NEXT: sd s3, 8(s0) -; RV64I-NEXT: ld s3, 8(sp) -; RV64I-NEXT: ld s2, 16(sp) -; RV64I-NEXT: ld s1, 24(sp) -; RV64I-NEXT: ld s0, 32(sp) -; RV64I-NEXT: ld ra, 40(sp) -; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: sub a0, s2, a1 +; RV64I-NEXT: mv a1, zero +; RV64I-NEXT: mv a2, zero +; RV64I-NEXT: mv a3, zero +; RV64I-NEXT: call __multi3 +; RV64I-NEXT: add a0, a1, s0 +; RV64I-NEXT: srli a0, a0, 12 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: addiw a1, a1, 1327 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: sub a0, s2, a0 +; RV64I-NEXT: sd zero, 0(s3) +; RV64I-NEXT: sd a0, 24(s3) +; RV64I-NEXT: sd s1, 16(s3) +; RV64I-NEXT: sd s4, 8(s3) +; RV64I-NEXT: ld s5, 8(sp) +; RV64I-NEXT: ld s4, 16(sp) +; RV64I-NEXT: ld s3, 24(sp) +; RV64I-NEXT: ld s2, 32(sp) +; RV64I-NEXT: ld s1, 40(sp) +; RV64I-NEXT: ld s0, 48(sp) +; RV64I-NEXT: ld ra, 56(sp) +; RV64I-NEXT: addi sp, sp, 64 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: dont_fold_urem_i64: diff --git a/llvm/test/CodeGen/SPARC/rem.ll b/llvm/test/CodeGen/SPARC/rem.ll --- a/llvm/test/CodeGen/SPARC/rem.ll +++ b/llvm/test/CodeGen/SPARC/rem.ll @@ -30,13 +30,31 @@ ; CHECK-LABEL: test3: ; CHECK: .cfi_startproc ; CHECK-NEXT: ! %bb.0: ! 
%entry -; CHECK-NEXT: sethi 2545, %o1 -; CHECK-NEXT: or %o1, 379, %o1 -; CHECK-NEXT: mulx %o0, %o1, %o0 -; CHECK-NEXT: udivx %o0, 1021, %o1 -; CHECK-NEXT: mulx %o1, 1021, %o1 -; CHECK-NEXT: retl -; CHECK-NEXT: sub %o0, %o1, %o0 +; CHECK-NEXT: save %sp, -176, %sp +; CHECK-NEXT: .cfi_def_cfa_register %fp +; CHECK-NEXT: .cfi_window_save +; CHECK-NEXT: .cfi_register %o7, %i7 +; CHECK-NEXT: sethi 2545, %i1 +; CHECK-NEXT: or %i1, 379, %i1 +; CHECK-NEXT: mulx %i0, %i1, %i0 +; CHECK-NEXT: sethi 1331003, %i1 +; CHECK-NEXT: or %i1, 435, %i1 +; CHECK-NEXT: sethi 12324, %i2 +; CHECK-NEXT: or %i2, 108, %i2 +; CHECK-NEXT: sllx %i2, 32, %i2 +; CHECK-NEXT: or %i2, %i1, %o3 +; CHECK-NEXT: mov 0, %o0 +; CHECK-NEXT: mov %i0, %o1 +; CHECK-NEXT: call __multi3 +; CHECK-NEXT: mov %o0, %o2 +; CHECK-NEXT: sub %i0, %o0, %i1 +; CHECK-NEXT: srlx %i1, 1, %i1 +; CHECK-NEXT: add %i1, %o0, %i1 +; CHECK-NEXT: srlx %i1, 9, %i1 +; CHECK-NEXT: mulx %i1, 1021, %i1 +; CHECK-NEXT: sub %i0, %i1, %i0 +; CHECK-NEXT: ret +; CHECK-NEXT: restore entry: %mul = mul i64 %b, 2606459 %rem = urem i64 %mul, 1021 diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll --- a/llvm/test/CodeGen/X86/divide-by-constant.ll +++ b/llvm/test/CodeGen/X86/divide-by-constant.ll @@ -294,19 +294,82 @@ define i64 @PR23590(i64 %x) nounwind { ; X32-LABEL: PR23590: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $12345 # imm = 0x3039 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $16, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $7 -; X32-NEXT: pushl %edx -; X32-NEXT: pushl %eax -; X32-NEXT: calll __udivdi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %ebp +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl $417841695, %ebx # imm = 0x18E7C21F +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl $1425045447, %edx # imm = 0x54F077C7 +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: setb %al +; X32-NEXT: movzbl %al, %ebp +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl $1425045447, %edx # imm = 0x54F077C7 +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: shrdl $12, %ebx, %eax +; X32-NEXT: movl $12345, %edx # imm = 0x3039 +; X32-NEXT: mull %edx +; X32-NEXT: shrl $12, %ebx +; X32-NEXT: imull $12345, %ebx, %edi # imm = 0x3039 +; X32-NEXT: addl %edx, %edi +; X32-NEXT: subl %eax, %esi +; X32-NEXT: sbbl %edi, %ecx +; X32-NEXT: movl $-1840700269, %ebp # imm = 0x92492493 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl $613566756, %edx # imm = 0x24924924 +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: adcl %edi, %ebx +; X32-NEXT: setb %al +; X32-NEXT: movzbl %al, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl 
$613566756, %edx # imm = 0x24924924 +; X32-NEXT: mull %edx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: adcl %edi, %edx +; X32-NEXT: subl %eax, %esi +; X32-NEXT: sbbl %edx, %ecx +; X32-NEXT: movl %ecx, %edi +; X32-NEXT: shrl %edi +; X32-NEXT: shldl $31, %esi, %ecx +; X32-NEXT: addl %eax, %ecx +; X32-NEXT: adcl %edx, %edi +; X32-NEXT: shrdl $2, %edi, %ecx +; X32-NEXT: shrl $2, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edi, %edx +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi +; X32-NEXT: popl %ebx +; X32-NEXT: popl %ebp ; X32-NEXT: retl ; ; X64-FAST-LABEL: PR23590: @@ -355,27 +418,40 @@ ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: subl $12, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $-294967296 # imm = 0xEE6B2800 -; X32-NEXT: pushl %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: calll __udivdi3 -; X32-NEXT: addl $16, %esp -; X32-NEXT: movl %eax, %esi +; X32-NEXT: pushl %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %esi, %edi +; X32-NEXT: shldl $21, %ecx, %edi +; X32-NEXT: movl $-400107883, %ebx # imm = 0xE826D695 +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: shrl $11, %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl $1125899, %edx # imm = 0x112E0B +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %edx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $-294967296 # imm = 0xEE6B2800 -; X32-NEXT: pushl %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $16, %esp -; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: adcl %ebp, %edi +; X32-NEXT: setb %al +; X32-NEXT: movzbl %al, %ebx ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %edi, %edx -; X32-NEXT: addl $12, %esp +; X32-NEXT: movl $1125899, %edx # imm = 0x112E0B +; X32-NEXT: mull %edx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: adcl %ebx, %edx +; X32-NEXT: shrdl $9, %edx, %eax +; X32-NEXT: imull $-294967296, %eax, %esi # imm = 0xEE6B2800 +; X32-NEXT: subl %esi, %ecx +; X32-NEXT: shrl $9, %edx +; X32-NEXT: addl $4, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -409,27 +485,56 @@ ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: subl $12, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: pushl %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $-294967296 # imm = 0xEE6B2800 -; X32-NEXT: pushl %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: calll __divdi3 -; X32-NEXT: addl $16, %esp -; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl %ebp, %esi +; X32-NEXT: sarl $31, %esi +; X32-NEXT: movl $651596979, %edi # imm = 0x26D694B3 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl $288230376, %edx # imm = 0x112E0BE8 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: setb %al +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; 
X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl $288230376, %edx # imm = 0x112E0BE8 +; X32-NEXT: mull %edx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $-294967296 # imm = 0xEE6B2800 -; X32-NEXT: pushl %ebp -; X32-NEXT: pushl %ebx -; X32-NEXT: calll __moddi3 -; X32-NEXT: addl $16, %esp -; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %edi, %edx -; X32-NEXT: addl $12, %esp +; X32-NEXT: movl $651596979, %edx # imm = 0x26D694B3 +; X32-NEXT: mull %edx +; X32-NEXT: imull $288230376, %esi, %ebx # imm = 0x112E0BE8 +; X32-NEXT: addl %edx, %ebx +; X32-NEXT: imull $651596979, %esi, %esi # imm = 0x26D694B3 +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: movl %esi, %edx +; X32-NEXT: sarl $28, %edx +; X32-NEXT: shrdl $28, %esi, %eax +; X32-NEXT: shrl $31, %esi +; X32-NEXT: addl %eax, %esi +; X32-NEXT: adcl $0, %edx +; X32-NEXT: imull $-294967296, %esi, %eax # imm = 0xEE6B2800 +; X32-NEXT: subl %eax, %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: addl $4, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -7,25 +7,97 @@ define i64 @mod128(i128 %x) nounwind { ; X86-64-LABEL: mod128: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $3, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __modti3 -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: pushq %rbx +; X86-64-NEXT: movq %rdi, %r9 +; X86-64-NEXT: movabsq $6148914691236517206, %r10 # imm = 0x5555555555555556 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: movq %rdx, %r8 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: movq %rdx, %r11 +; X86-64-NEXT: movq %rax, %rdi +; X86-64-NEXT: addq %r8, %rdi +; X86-64-NEXT: adcq $0, %r11 +; X86-64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X86-64-NEXT: movq %r9, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %r8 +; X86-64-NEXT: addq %rdi, %rax +; X86-64-NEXT: adcq %r11, %r8 +; X86-64-NEXT: setb %al +; X86-64-NEXT: movzbl %al, %ebx +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %r11 +; X86-64-NEXT: movq %rax, %rdi +; X86-64-NEXT: addq %r8, %rdi +; X86-64-NEXT: adcq %rbx, %r11 +; X86-64-NEXT: sarq $63, %rsi +; X86-64-NEXT: imulq %rsi, %rcx +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: addq %rcx, %rdx +; X86-64-NEXT: imulq %r10, %rsi +; X86-64-NEXT: addq %rdx, %rsi +; X86-64-NEXT: addq %rdi, %rax +; X86-64-NEXT: adcq %r11, %rsi +; X86-64-NEXT: shrq $63, %rsi +; X86-64-NEXT: addq %rax, %rsi +; X86-64-NEXT: leaq (%rsi,%rsi,2), %rax +; X86-64-NEXT: subq %rax, %r9 +; X86-64-NEXT: movq %r9, %rax +; X86-64-NEXT: popq %rbx ; X86-64-NEXT: retq ; ; WIN64-LABEL: mod128: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __modti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: pushq %rsi +; WIN64-NEXT: pushq %rdi +; WIN64-NEXT: pushq %rbx +; WIN64-NEXT: movq 
%rdx, %r8 +; WIN64-NEXT: movq %rcx, %r10 +; WIN64-NEXT: movabsq $6148914691236517206, %rsi # imm = 0x5555555555555556 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: movq %rdx, %r11 +; WIN64-NEXT: movq %rax, %rcx +; WIN64-NEXT: addq %r9, %rcx +; WIN64-NEXT: adcq $0, %r11 +; WIN64-NEXT: movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555 +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: adcq %r11, %r9 +; WIN64-NEXT: setb %al +; WIN64-NEXT: movzbl %al, %ebx +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: movq %rdx, %r11 +; WIN64-NEXT: movq %rax, %rcx +; WIN64-NEXT: addq %r9, %rcx +; WIN64-NEXT: adcq %rbx, %r11 +; WIN64-NEXT: sarq $63, %r8 +; WIN64-NEXT: imulq %r8, %rdi +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: addq %rdi, %rdx +; WIN64-NEXT: imulq %rsi, %r8 +; WIN64-NEXT: addq %rdx, %r8 +; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: adcq %r11, %r8 +; WIN64-NEXT: shrq $63, %r8 +; WIN64-NEXT: addq %rax, %r8 +; WIN64-NEXT: leaq (%r8,%r8,2), %rax +; WIN64-NEXT: subq %rax, %r10 +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: popq %rbx +; WIN64-NEXT: popq %rdi +; WIN64-NEXT: popq %rsi ; WIN64-NEXT: retq @@ -37,25 +109,89 @@ define i64 @div128(i128 %x) nounwind { ; X86-64-LABEL: div128: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $3, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __divti3 -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rdi, %r9 +; X86-64-NEXT: movabsq $6148914691236517206, %r10 # imm = 0x5555555555555556 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: movq %rdx, %r8 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: movq %rdx, %r11 +; X86-64-NEXT: movq %rax, %rdi +; X86-64-NEXT: addq %r8, %rdi +; X86-64-NEXT: adcq $0, %r11 +; X86-64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X86-64-NEXT: movq %r9, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %r8 +; X86-64-NEXT: addq %rdi, %rax +; X86-64-NEXT: adcq %r11, %r8 +; X86-64-NEXT: setb %al +; X86-64-NEXT: movzbl %al, %r11d +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %r9 +; X86-64-NEXT: movq %rax, %rdi +; X86-64-NEXT: addq %r8, %rdi +; X86-64-NEXT: adcq %r11, %r9 +; X86-64-NEXT: sarq $63, %rsi +; X86-64-NEXT: imulq %rsi, %rcx +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %r10 +; X86-64-NEXT: addq %rcx, %rdx +; X86-64-NEXT: imulq %r10, %rsi +; X86-64-NEXT: addq %rdx, %rsi +; X86-64-NEXT: addq %rdi, %rax +; X86-64-NEXT: adcq %r9, %rsi +; X86-64-NEXT: shrq $63, %rsi +; X86-64-NEXT: addq %rsi, %rax ; X86-64-NEXT: retq ; ; WIN64-LABEL: div128: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __divti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: pushq %rsi +; WIN64-NEXT: pushq %rdi +; WIN64-NEXT: pushq %rbx +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %r10 +; WIN64-NEXT: movabsq $6148914691236517206, %rbx # imm = 0x5555555555555556 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rbx +; WIN64-NEXT: movq %rdx, %r9 +; 
WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rbx +; WIN64-NEXT: movq %rdx, %r11 +; WIN64-NEXT: movq %rax, %rcx +; WIN64-NEXT: addq %r9, %rcx +; WIN64-NEXT: adcq $0, %r11 +; WIN64-NEXT: movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555 +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: adcq %r11, %r9 +; WIN64-NEXT: setb %al +; WIN64-NEXT: movzbl %al, %esi +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rdi +; WIN64-NEXT: movq %rdx, %r10 +; WIN64-NEXT: movq %rax, %rcx +; WIN64-NEXT: addq %r9, %rcx +; WIN64-NEXT: adcq %rsi, %r10 +; WIN64-NEXT: sarq $63, %r8 +; WIN64-NEXT: imulq %r8, %rdi +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rbx +; WIN64-NEXT: addq %rdi, %rdx +; WIN64-NEXT: imulq %rbx, %r8 +; WIN64-NEXT: addq %rdx, %r8 +; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: adcq %r10, %r8 +; WIN64-NEXT: shrq $63, %r8 +; WIN64-NEXT: addq %r8, %rax +; WIN64-NEXT: popq %rbx +; WIN64-NEXT: popq %rdi +; WIN64-NEXT: popq %rsi ; WIN64-NEXT: retq @@ -67,25 +203,68 @@ define i64 @umod128(i128 %x) nounwind { ; X86-64-LABEL: umod128: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $3, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3 -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rsi, %r8 +; X86-64-NEXT: movq %rdi, %r10 +; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %r9 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %rsi +; X86-64-NEXT: movq %rax, %rcx +; X86-64-NEXT: addq %r9, %rcx +; X86-64-NEXT: adcq $0, %rsi +; X86-64-NEXT: movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r9 +; X86-64-NEXT: movq %rdx, %rdi +; X86-64-NEXT: addq %rcx, %rax +; X86-64-NEXT: adcq %rsi, %rdi +; X86-64-NEXT: setb %al +; X86-64-NEXT: movzbl %al, %ecx +; X86-64-NEXT: movq %r8, %rax +; X86-64-NEXT: mulq %r9 +; X86-64-NEXT: addq %rdi, %rax +; X86-64-NEXT: adcq %rcx, %rdx +; X86-64-NEXT: shldq $63, %rax, %rdx +; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax +; X86-64-NEXT: subq %rax, %r10 +; X86-64-NEXT: movq %r10, %rax ; X86-64-NEXT: retq ; ; WIN64-LABEL: umod128: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: pushq %rsi +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %r10 +; WIN64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: mulq %rcx +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rcx +; WIN64-NEXT: movq %rdx, %r11 +; WIN64-NEXT: movq %rax, %rcx +; WIN64-NEXT: addq %r9, %rcx +; WIN64-NEXT: adcq $0, %r11 +; WIN64-NEXT: movabsq $-6148914691236517206, %rsi # imm = 0xAAAAAAAAAAAAAAAA +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: adcq %r11, %r9 +; WIN64-NEXT: setb %al +; WIN64-NEXT: movzbl %al, %ecx +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: addq %r9, %rax +; WIN64-NEXT: adcq %rcx, %rdx +; WIN64-NEXT: 
shldq $63, %rax, %rdx +; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax +; WIN64-NEXT: subq %rax, %r10 +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: popq %rsi ; WIN64-NEXT: retq @@ -97,25 +276,61 @@ define i64 @udiv128(i128 %x) nounwind { ; X86-64-LABEL: udiv128: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $3, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __udivti3 -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rsi, %r8 +; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %r9 +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %rsi +; X86-64-NEXT: movq %rax, %rcx +; X86-64-NEXT: addq %r9, %rcx +; X86-64-NEXT: adcq $0, %rsi +; X86-64-NEXT: movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r9 +; X86-64-NEXT: movq %rdx, %rdi +; X86-64-NEXT: addq %rcx, %rax +; X86-64-NEXT: adcq %rsi, %rdi +; X86-64-NEXT: setb %al +; X86-64-NEXT: movzbl %al, %ecx +; X86-64-NEXT: movq %r8, %rax +; X86-64-NEXT: mulq %r9 +; X86-64-NEXT: addq %rdi, %rax +; X86-64-NEXT: adcq %rdx, %rcx +; X86-64-NEXT: shrdq $1, %rcx, %rax ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv128: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: pushq %rsi +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %r10 +; WIN64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: mulq %rcx +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rcx +; WIN64-NEXT: movq %rdx, %r11 +; WIN64-NEXT: movq %rax, %rcx +; WIN64-NEXT: addq %r9, %rcx +; WIN64-NEXT: adcq $0, %r11 +; WIN64-NEXT: movabsq $-6148914691236517206, %rsi # imm = 0xAAAAAAAAAAAAAAAA +; WIN64-NEXT: movq %r10, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: movq %rdx, %r9 +; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: adcq %r11, %r9 +; WIN64-NEXT: setb %al +; WIN64-NEXT: movzbl %al, %ecx +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rsi +; WIN64-NEXT: addq %r9, %rax +; WIN64-NEXT: adcq %rdx, %rcx +; WIN64-NEXT: shrdq $1, %rcx, %rax +; WIN64-NEXT: popq %rsi ; WIN64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll --- a/llvm/test/CodeGen/X86/i128-sdiv.ll +++ b/llvm/test/CodeGen/X86/i128-sdiv.ll @@ -5,11 +5,15 @@ ; Make sure none of these crash, and that the power-of-two transformations ; trigger correctly. 
-define i128 @test1(i128 %x) nounwind { +define i128 @test1(i128 %x) { ; X86-LABEL: test1: ; X86: # %bb.0: ; X86-NEXT: pushl %edi +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: .cfi_offset %esi, -12 +; X86-NEXT: .cfi_offset %edi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %edx @@ -30,7 +34,9 @@ ; X86-NEXT: movl %edx, 4(%eax) ; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: popl %edi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl $4 ; ; X64-LABEL: test1: @@ -49,12 +55,18 @@ ret i128 %tmp } -define i128 @test2(i128 %x) nounwind { +define i128 @test2(i128 %x) { ; X86-LABEL: test2: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %edi +; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: .cfi_offset %esi, -16 +; X86-NEXT: .cfi_offset %edi, -12 +; X86-NEXT: .cfi_offset %ebx, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %edx @@ -82,8 +94,11 @@ ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: popl %edi +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: popl %ebx +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl $4 ; ; X64-LABEL: test2: @@ -105,50 +120,277 @@ ret i128 %tmp } -define i128 @test3(i128 %x) nounwind { +define i128 @test3(i128 %x) { ; X86-LABEL: test3: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: pushl %ebx +; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: pushl %edi +; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $16, %esp -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %esp, %eax -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl $-5 -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl $-3 -; X86-NEXT: pushl 24(%ebp) -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl 12(%ebp) -; X86-NEXT: pushl %eax -; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: .cfi_def_cfa_offset 20 +; X86-NEXT: subl $48, %esp +; X86-NEXT: .cfi_def_cfa_offset 68 +; X86-NEXT: .cfi_offset %esi, -20 +; X86-NEXT: .cfi_offset %edi, -16 +; X86-NEXT: .cfi_offset %ebx, -12 +; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl $-1, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl $1610612735, %edx # imm = 0x5FFFFFFF +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl $1610612735, %edi # imm = 0x5FFFFFFF +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: setb %bl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edi +; X86-NEXT: 
addl %ebp, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %esi, %eax -; X86-NEXT: leal -8(%ebp), %esp +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl $-1, %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl $1610612735, %edx # imm = 0x5FFFFFFF +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $1610612735, %edx # imm = 0x5FFFFFFF +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movzbl %bl, %edx +; X86-NEXT: adcl %edx, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl $-2147483648, %ebx # imm = 0x80000000 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl $0, %eax +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $-2147483648, %esi # imm = 0x80000000 +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: addl %edi, %eax +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: setb %al +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; 
X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $-1, %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $1610612735, %edx # imm = 0x5FFFFFFF +; X86-NEXT: mull %edx +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: imull $1610612735, %ecx, %edx # imm = 0x5FFFFFFF +; X86-NEXT: addl %edi, %edx +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: sarl %esi +; X86-NEXT: shrdl $1, %eax, %ebp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: addl $48, %esp +; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: popl %ebx +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl $4 ; ; X64-LABEL: test3: ; X64: # %bb.0: -; X64-NEXT: pushq %rax -; X64-NEXT: movq $-3, %rdx -; X64-NEXT: movq $-5, %rcx -; X64-NEXT: callq __divti3 -; X64-NEXT: popq %rcx +; X64-NEXT: pushq %r14 +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: pushq %rbx +; X64-NEXT: .cfi_def_cfa_offset 24 +; X64-NEXT: .cfi_offset %rbx, -24 +; X64-NEXT: .cfi_offset %r14, -16 +; X64-NEXT: movq %rsi, %r10 +; X64-NEXT: movq %rdi, %r9 +; X64-NEXT: movabsq $6917529027641081855, %r14 # imm = 0x5FFFFFFFFFFFFFFF +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq 
%rdx, %rcx +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r8, %rdi +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movabsq $-9223372036854775808, %rbx # imm = 0x8000000000000000 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: adcq %rcx, %rsi +; X64-NEXT: setb %al +; X64-NEXT: movzbl %al, %ecx +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rsi, %rdi +; X64-NEXT: adcq %rcx, %r11 +; X64-NEXT: andq %r10, %rbx +; X64-NEXT: movq %r10, %rsi +; X64-NEXT: sarq $63, %rsi +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rbx, %rdx +; X64-NEXT: imulq %r14, %rsi +; X64-NEXT: addq %rdx, %rsi +; X64-NEXT: movq $-1, %rcx +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: subq %r9, %rcx +; X64-NEXT: subq %r10, %rcx +; X64-NEXT: addq %r8, %rax +; X64-NEXT: adcq %rsi, %rcx +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: adcq %r11, %rcx +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: sarq $63, %rdx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: shrq $63, %rax +; X64-NEXT: sarq %rcx +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: adcq $0, %rdx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: popq %rbx +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: popq %r14 +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq %tmp = sdiv i128 %x, -73786976294838206467 ret i128 %tmp diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll --- a/llvm/test/CodeGen/X86/i128-udiv.ll +++ b/llvm/test/CodeGen/X86/i128-udiv.ll @@ -33,46 +33,74 @@ ; X86-LABEL: test2: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $16, %esp -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %esp, %eax -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl $-4 -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $0 -; X86-NEXT: pushl 24(%ebp) -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl 12(%ebp) ; X86-NEXT: pushl %eax -; X86-NEXT: calll __udivti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl (%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: shrdl $2, %esi, %ecx +; X86-NEXT: movl $4, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: shrl $2, %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl $17, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: leal -8(%ebp), %esp +; X86-NEXT: mull %ebx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebp, (%eax) +; X86-NEXT: setb %cl +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl $0, 12(%eax) +; X86-NEXT: movl $0, 8(%eax) +; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: 
popl %ebx ; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test2: ; X64: # %bb.0: -; X64-NEXT: pushq %rax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: movq $-4, %rcx -; X64-NEXT: callq __udivti3 -; X64-NEXT: popq %rcx +; X64-NEXT: shrq $2, %rsi +; X64-NEXT: movl $17, %ecx +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %r8, %rcx +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movl $4, %r8d +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: adcq %rdi, %rsi +; X64-NEXT: setb %al +; X64-NEXT: movzbl %al, %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: mulq %r8 +; X64-NEXT: addq %rsi, %rax +; X64-NEXT: adcq %rcx, %rdx ; X64-NEXT: retq %tmp = udiv i128 %x, -73786976294838206464 ret i128 %tmp @@ -82,46 +110,131 @@ ; X86-LABEL: test3: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $16, %esp -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %esp, %eax -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl $-5 -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl $-3 -; X86-NEXT: pushl 24(%ebp) -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl 12(%ebp) -; X86-NEXT: pushl %eax -; X86-NEXT: calll __udivti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%esi) -; X86-NEXT: movl %edx, 8(%esi) -; X86-NEXT: movl %ecx, 4(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl $1073741824, %ebx # imm = 0x40000000 +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl $1073741824, %ebx # imm = 0x40000000 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl %edx, %edi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %edx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %eax, %edx +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl $0, %eax +; X86-NEXT: setb %bl +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl $1073741824, %edx # imm = 0x40000000 +; X86-NEXT: mull %edx +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: movl $5, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, %eax -; X86-NEXT: leal -8(%ebp), %esp +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $5, %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $5, %edx +; X86-NEXT: mull %edx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: setb %al +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl {{[0-9]+}}(%esp), %edx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: setb %dl +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: addb $255, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: addb $255, %dl +; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addb $255, %al +; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: setb %al +; X86-NEXT: addl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: shrl $30, %ecx +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl $0, 12(%eax) +; X86-NEXT: movl $0, 8(%eax) +; X86-NEXT: movl $0, 4(%eax) +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test3: ; X64: # %bb.0: -; X64-NEXT: pushq %rax -; X64-NEXT: movq $-3, %rdx -; X64-NEXT: movq $-5, %rcx -; X64-NEXT: callq __udivti3 -; X64-NEXT: popq %rcx +; X64-NEXT: movq %rsi, %r8 +; X64-NEXT: movl $5, %ecx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %r9, %rcx +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movabsq $4611686018427387905, %r9 # imm = 0x4000000000000001 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: adcq %rsi, %rdi +; X64-NEXT: setb %al +; X64-NEXT: movzbl %al, %ecx +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: adcq %rdx, %rcx +; X64-NEXT: shrq $62, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq %tmp = udiv i128 %x, -73786976294838206467 ret i128 %tmp diff --git a/llvm/test/CodeGen/X86/pr44812.ll b/llvm/test/CodeGen/X86/pr44812.ll --- a/llvm/test/CodeGen/X86/pr44812.ll +++ b/llvm/test/CodeGen/X86/pr44812.ll @@ -4,18 +4,28 @@ define <2 x i32> @foo(<2 x i32> %tmp) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: leal 7(%eax), %ecx -; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: cmovnsl %eax, %ecx -; CHECK-NEXT: sarl $3, %ecx +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_offset %esi, -8 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl $-2147483647, %edx # imm = 0x80000001 +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: imull %edx +; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: addl %ecx, %esi +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: shrl $31, %eax +; CHECK-NEXT: sarl $2, %esi +; CHECK-NEXT: addl %eax, %esi ; CHECK-NEXT: movl $1717986919, %eax # imm = 0x66666667 ; CHECK-NEXT: imull {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: 
shrl $31, %eax ; CHECK-NEXT: sarl $2, %edx ; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: popl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl entry: %tmp1 = sdiv <2 x i32> %tmp, diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll --- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll +++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll @@ -56,28 +56,75 @@ define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind { ; X86-LABEL: vrolq_extract_udiv: ; X86: # %bb.0: -; X86-NEXT: subl $44, %esp -; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vmovss %xmm0, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $3, {{[0-9]+}}(%esp) -; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vextractps $2, %xmm0, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $3, {{[0-9]+}}(%esp) -; X86-NEXT: vmovd %eax, %xmm0 -; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 -; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: vmovd %xmm0, %ecx +; X86-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: vpextrd $1, %xmm0, %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: setb %al +; X86-NEXT: movzbl %al, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: mull %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: shrl %ecx +; X86-NEXT: shldl $31, %eax, %edx +; X86-NEXT: vmovd %edx, %xmm1 +; X86-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; X86-NEXT: vpextrd $2, %xmm0, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: vpextrd $3, %xmm0, %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $-1431655766, %ecx # imm = 0xAAAAAAAA +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: setb %al +; X86-NEXT: movzbl %al, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: mull %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: shrdl $1, %edx, %eax +; X86-NEXT: vpinsrd $2, %eax, %xmm1, %xmm0 +; X86-NEXT: shrl %edx ; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ; X86-NEXT: vprolq $57, %zmm0, %zmm0 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; X86-NEXT: addl $44, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl 
%edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; @@ -207,46 +254,121 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind { ; X86-LABEL: no_extract_udiv: ; X86: # %bb.0: -; X86-NEXT: subl $60, %esp -; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vmovss %xmm0, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $3, {{[0-9]+}}(%esp) -; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vextractps $2, %xmm0, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $3, {{[0-9]+}}(%esp) -; X86-NEXT: vmovd %eax, %xmm0 -; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vmovss %xmm0, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $770, {{[0-9]+}}(%esp) # imm = 0x302 -; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vextractps $2, %xmm0, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $770, {{[0-9]+}}(%esp) # imm = 0x302 -; X86-NEXT: vmovd %eax, %xmm0 -; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 -; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 -; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload -; X86-NEXT: vpsllq $56, %xmm1, %xmm1 -; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 -; X86-NEXT: addl $60, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $20, %esp +; X86-NEXT: vpextrd $2, %xmm0, %ecx +; X86-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: vpextrd $3, %xmm0, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: imull $-1431655766, %esi, %eax # imm = 0xAAAAAAAA +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: vmovd %xmm0, %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl $-1431655765, %ecx # imm = 0xAAAAAAAB +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: vpextrd $1, %xmm0, %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: mull %edx +; X86-NEXT: addl %ebp, %eax +; 
X86-NEXT: adcl %ecx, %edx +; X86-NEXT: imull $-1431655766, %edi, %eax # imm = 0xAAAAAAAA +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $-602410997, %esi # imm = 0xDC17F00B +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl $-1439092939, %ecx # imm = 0xAA392F35 +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl $-1439092939, %edx # imm = 0xAA392F35 +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: shldl $23, %eax, %esi +; X86-NEXT: shrl $9, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $-602410997, %edi # imm = 0xDC17F00B +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $-1439092939, %ecx # imm = 0xAA392F35 +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: vmovd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; X86-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; X86-NEXT: vmovd %esi, %xmm1 +; X86-NEXT: vpinsrd $1, %ebp, %xmm1, %xmm1 +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl $-1439092939, %edx # imm = 0xAA392F35 +; X86-NEXT: mull %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: shrdl $9, %edx, %eax +; X86-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X86-NEXT: shrl $9, %edx +; X86-NEXT: vpinsrd $3, %edx, %xmm1, %xmm1 +; X86-NEXT: vpsllq $55, %xmm0, %xmm0 +; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X86-NEXT: addl $20, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: no_extract_udiv: @@ -261,7 +383,8 @@ ; X64-NEXT: mulq %rdi ; X64-NEXT: vmovq %rdx, %xmm0 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-NEXT: vpsrlq $1, %xmm0, %xmm0 +; X64-NEXT: vpsllq $55, %xmm0, %xmm0 +; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; X64-NEXT: movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rdi @@ -271,7 +394,6 @@ ; X64-NEXT: vmovq %rdx, %xmm2 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X64-NEXT: vpsrlq $9, %xmm1, %xmm1 -; X64-NEXT: vpsllq $56, %xmm0, %xmm0 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq %lhs_div = udiv <2 x i64> %i, diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll --- a/llvm/test/CodeGen/X86/rotate-extract.ll +++ b/llvm/test/CodeGen/X86/rotate-extract.ll @@ -232,31 +232,31 @@ ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull 
$171, %eax, %ecx +; X86-NEXT: shlb $3, %ch +; X86-NEXT: andb $-16, %ch ; X86-NEXT: imull $79, %eax, %edx ; X86-NEXT: subb %dh, %al ; X86-NEXT: shrb %al ; X86-NEXT: addb %dh, %al ; X86-NEXT: shrb $5, %al -; X86-NEXT: shlb $3, %ch -; X86-NEXT: orb %al, %ch -; X86-NEXT: andb $-9, %ch -; X86-NEXT: movb %ch, %al +; X86-NEXT: orb %ch, %al +; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; ; X64-LABEL: no_extract_udiv: ; X64: # %bb.0: -; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: imull $171, %ecx, %eax -; X64-NEXT: shrl $8, %eax -; X64-NEXT: imull $79, %ecx, %edx +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: imull $171, %eax, %ecx +; X64-NEXT: shrl $8, %ecx +; X64-NEXT: shlb $3, %cl +; X64-NEXT: andb $-16, %cl +; X64-NEXT: imull $79, %eax, %edx ; X64-NEXT: shrl $8, %edx -; X64-NEXT: subb %dl, %cl -; X64-NEXT: shrb %cl -; X64-NEXT: addb %dl, %cl -; X64-NEXT: shrb $5, %cl -; X64-NEXT: shlb $3, %al +; X64-NEXT: subb %dl, %al +; X64-NEXT: shrb %al +; X64-NEXT: addb %dl, %al +; X64-NEXT: shrb $5, %al ; X64-NEXT: orb %cl, %al -; X64-NEXT: andb $-9, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %lhs_div = udiv i8 %i, 3 diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll --- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll @@ -6,103 +6,34 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { ; SSE-LABEL: fold_srem_vec_1: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: subl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $9, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 -; SSE-NEXT: shrl $16, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: movswl %dx, %esi -; SSE-NEXT: shrl $15, %edx -; SSE-NEXT: sarl $6, %esi -; SSE-NEXT: addl %edx, %esi -; SSE-NEXT: imull $95, %esi, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pextrw $1, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF -; SSE-NEXT: movl %edx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $21, %edx -; SSE-NEXT: addl %esi, %edx -; SSE-NEXT: imull $-124, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $2675, %edx, %edx # imm = 0xA73 -; SSE-NEXT: movl %edx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $18, %edx -; SSE-NEXT: addl %esi, %edx -; SSE-NEXT: imull $98, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: pinsrw $2, %ecx, %xmm1 -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = <1,0,0,65535,u,u,u,u> +; SSE-NEXT: pmullw %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrlw $15, %xmm1 +; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fold_srem_vec_1: ; AVX: # %bb.0: 
-; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: subl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $9, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 -; AVX-NEXT: shrl $16, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: movswl %dx, %esi -; AVX-NEXT: shrl $15, %edx -; AVX-NEXT: sarl $6, %esi -; AVX-NEXT: addl %edx, %esi -; AVX-NEXT: imull $95, %esi, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpextrw $1, %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $21, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: imull $-124, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $2675, %edx, %edx # imm = 0xA73 -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $18, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: imull $98, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 -; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm2 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -111,8 +42,10 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { ; SSE-LABEL: fold_srem_vec_2: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] -; SSE-NEXT: pmulhw %xmm0, %xmm1 +; SSE-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 ; SSE-NEXT: paddw %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psrlw $15, %xmm2 @@ -122,16 +55,34 @@ ; SSE-NEXT: psubw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: fold_srem_vec_2: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 -; AVX-NEXT: vpsraw $6, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: fold_srem_vec_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX1-NEXT: vpsraw $6, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fold_srem_vec_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX2-NEXT: 
vpbroadcastd {{.*#+}} xmm2 = [4294945911,4294945911,4294945911,4294945911] +; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX2-NEXT: vpsraw $6, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 } @@ -141,30 +92,51 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; SSE-LABEL: combine_srem_sdiv: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] -; SSE-NEXT: pmulhw %xmm0, %xmm1 +; SSE-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 ; SSE-NEXT: paddw %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psrlw $15, %xmm2 ; SSE-NEXT: psraw $6, %xmm1 ; SSE-NEXT: paddw %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = <95,95,95,95,u,u,u,u> ; SSE-NEXT: pmullw %xmm1, %xmm2 ; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: paddw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_srem_sdiv: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 -; AVX-NEXT: vpsraw $6, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 -; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_srem_sdiv: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX1-NEXT: vpsraw $6, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_srem_sdiv: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294945911,4294945911,4294945911,4294945911] +; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX2-NEXT: vpsraw $6, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 +; AVX2-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = srem <4 x i16> %x, %2 = sdiv <4 x i16> %x, %3 = add <4 x i16> %1, %2 @@ -175,79 +147,31 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; SSE-LABEL: dont_fold_srem_power_of_two: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: leal 31(%rax), %ecx -; SSE-NEXT: testw %ax, %ax -; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-32, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: leal 63(%rcx), %edx -; SSE-NEXT: testw %cx, %cx -; SSE-NEXT: cmovnsl %ecx, %edx -; SSE-NEXT: andl $-64, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: leal 7(%rax), %ecx -; SSE-NEXT: 
testw %ax, %ax -; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-8, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $2, %eax, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $6, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: imull $95, %edx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlw $15, %xmm2 +; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE-NEXT: psubw %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_srem_power_of_two: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: leal 31(%rax), %ecx -; AVX-NEXT: testw %ax, %ax -; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-32, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: leal 63(%rcx), %edx -; AVX-NEXT: testw %cx, %cx -; AVX-NEXT: cmovnsl %ecx, %edx -; AVX-NEXT: andl $-64, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: leal 7(%rax), %ecx -; AVX-NEXT: testw %ax, %ax -; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-8, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $6, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: imull $95, %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -257,83 +181,40 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; SSE-LABEL: dont_fold_srem_one: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $4, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B -; SSE-NEXT: movl %ecx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $23, %ecx -; SSE-NEXT: addl %esi, %ecx -; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; SSE-NEXT: subl %ecx, %eax ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: 
pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; SSE-NEXT: movl %ecx, %edx -; SSE-NEXT: shrl $31, %edx -; SSE-NEXT: sarl $26, %ecx -; SSE-NEXT: addl %edx, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4,5,6,7] +; SSE-NEXT: pmovsxwd %xmm0, %xmm3 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: packusdw %xmm3, %xmm3 +; SSE-NEXT: paddw %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = +; SSE-NEXT: pmulhw %xmm3, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7] +; SSE-NEXT: psrlw $15, %xmm3 +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5,6,7] +; SSE-NEXT: paddw %xmm2, %xmm3 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm3 +; SSE-NEXT: psubw %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_srem_one: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $4, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B -; AVX-NEXT: movl %ecx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $23, %ecx -; AVX-NEXT: addl %esi, %ecx -; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; AVX-NEXT: subl %ecx, %eax ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $26, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] +; AVX-NEXT: vpmovsxwd %xmm0, %xmm3 +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm2, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3,4,5,6,7] +; AVX-NEXT: vpsrlw $15, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -343,77 +224,40 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { ; SSE-LABEL: dont_fold_urem_i16_smax: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $4, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), 
%ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: leal 32767(%rax), %ecx -; SSE-NEXT: testw %ax, %ax -; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-32768, %ecx # imm = 0x8000 -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; SSE-NEXT: movl %ecx, %edx -; SSE-NEXT: shrl $31, %edx -; SSE-NEXT: sarl $26, %ecx -; SSE-NEXT: addl %edx, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = <1,65535,1,0,u,u,u,u> +; SSE-NEXT: pmullw %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = +; SSE-NEXT: pmulhw %xmm2, %xmm1 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] +; SSE-NEXT: psrlw $15, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3],xmm3[4,5,6,7] +; SSE-NEXT: paddw %xmm1, %xmm3 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm3 +; SSE-NEXT: psubw %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_urem_i16_smax: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $4, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: leal 32767(%rax), %ecx -; AVX-NEXT: testw %ax, %ax -; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-32768, %ecx # imm = 0x8000 -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $26, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm2 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpsrlw $15, %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3],xmm3[4,5,6,7] +; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -423,133 +267,153 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; SSE-LABEL: dont_fold_srem_i64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movq %xmm1, %rcx -; SSE-NEXT: movabsq 
$-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: imulq %rdx -; SSE-NEXT: addq %rcx, %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: shrq $63, %rax -; SSE-NEXT: sarq $4, %rdx -; SSE-NEXT: addq %rax, %rdx -; SSE-NEXT: leaq (%rdx,%rdx,2), %rax -; SSE-NEXT: shlq $3, %rax -; SSE-NEXT: subq %rax, %rdx -; SSE-NEXT: addq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: pextrq $1, %xmm2, %rcx -; SSE-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: imulq %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: shrq $63, %rax -; SSE-NEXT: sarq $11, %rdx -; SSE-NEXT: addq %rax, %rdx -; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F -; SSE-NEXT: subq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm2 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: pextrq $1, %xmm0, %rcx -; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: imulq %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: shrq $63, %rax -; SSE-NEXT: sarq $8, %rdx -; SSE-NEXT: addq %rax, %rdx -; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E -; SSE-NEXT: subq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: movabsq $6966426675817289639, %rcx # imm = 0x60ADB826E5E517A7 +; SSE-NEXT: imulq %rcx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: movabsq $-5614226457215950491, %rcx # imm = 0xB21642C8590B2165 +; SSE-NEXT: imulq %rcx +; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero +; SSE-NEXT: paddq %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrlq $11, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrlq $4, %xmm4 +; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [576460752303423488,4503599627370496] +; SSE-NEXT: pxor %xmm3, %xmm4 +; SSE-NEXT: psrlq $63, %xmm2 +; SSE-NEXT: paddq %xmm4, %xmm2 +; SSE-NEXT: psubq %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [23,5423] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pmuludq %xmm3, %xmm4 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 +; SSE-NEXT: paddq %xmm4, %xmm2 +; SSE-NEXT: psubq %xmm2, %xmm1 +; SSE-NEXT: pextrq $1, %xmm0, %rax +; SSE-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5 +; SSE-NEXT: imulq %rcx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; SSE-NEXT: movq {{.*#+}} xmm3 = xmm0[0],zero +; SSE-NEXT: paddq %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrlq $8, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,36028797018963968] +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: psrlq $63, %xmm3 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; SSE-NEXT: paddq %xmm2, %xmm5 +; SSE-NEXT: psubq %xmm4, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,654] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: psrlq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm2, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: paddq %xmm3, %xmm5 +; SSE-NEXT: psubq %xmm5, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: dont_fold_srem_i64: ; AVX1: # 
%bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: imulq %rdx -; AVX1-NEXT: addq %rcx, %rdx -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq $4, %rdx -; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: leaq (%rdx,%rdx,2), %rax -; AVX1-NEXT: shlq $3, %rax -; AVX1-NEXT: subq %rax, %rdx -; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: movabsq $6966426675817289639, %rcx # imm = 0x60ADB826E5E517A7 +; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: movabsq $-5614226457215950491, %rcx # imm = 0xB21642C8590B2165 +; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $11, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq $4, %xmm2, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [576460752303423488,4503599627370496] +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [23,5423] +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5 +; AVX1-NEXT: imulq %rcx ; AVX1-NEXT: vmovq %rdx, %xmm2 -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: imulq %rdx -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq $11, %rdx -; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: imulq %rdx -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq $8, %rdx -; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: imulq $654, %rdx, %rax # imm = 0x28E -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $8, %xmm2, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,36028797018963968] +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,654] +; AVX1-NEXT: vpmuludq 
%xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: dont_fold_srem_i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: imulq %rdx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq $4, %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: leaq (%rdx,%rdx,2), %rax -; AVX2-NEXT: shlq $3, %rax -; AVX2-NEXT: subq %rax, %rdx -; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: movabsq $6966426675817289639, %rcx # imm = 0x60ADB826E5E517A7 +; AVX2-NEXT: imulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: movabsq $-5614226457215950491, %rcx # imm = 0xB21642C8590B2165 +; AVX2-NEXT: imulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5 +; AVX2-NEXT: imulq %rcx ; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: imulq %rdx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq $11, %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: imulq %rdx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq $8, %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $63, %ymm1, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [9223372036854775808,36028797018963968,576460752303423488,4503599627370496] +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,654,23,5423] +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %1 = srem <4 x i64> %x, ret <4 x i64> %1 diff --git a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll 
b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll --- a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll @@ -295,17 +295,51 @@ define i1 @t64_3_2(i64 %X) nounwind { ; X86-LABEL: t64_3_2: ; X86: # %bb.0: -; X86-NEXT: subl $12, %esp -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll __umoddi3 -; X86-NEXT: addl $16, %esp -; X86-NEXT: xorl $2, %eax -; X86-NEXT: orl %edx, %eax +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: setb %al +; X86-NEXT: movzbl %al, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: shrdl $1, %ebx, %eax +; X86-NEXT: movl $3, %edx +; X86-NEXT: mull %edx +; X86-NEXT: shrl %ebx +; X86-NEXT: leal (%ebx,%ebx,2), %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: sbbl %edi, %esi +; X86-NEXT: xorl $2, %ecx +; X86-NEXT: orl %esi, %ecx ; X86-NEXT: sete %al -; X86-NEXT: addl $12, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: t64_3_2: diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll --- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll @@ -6,81 +6,42 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; SSE-LABEL: fold_urem_vec_1: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrl $2, %ecx -; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 -; SSE-NEXT: shrl $19, %ecx -; SSE-NEXT: imull $124, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: movzwl %cx, %edx -; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77 -; SSE-NEXT: shrl $22, %edx -; SSE-NEXT: imull $95, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrl %ecx -; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 -; SSE-NEXT: shrl $17, %ecx -; SSE-NEXT: imull $98, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $2, %eax, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $1373, %eax, %ecx # imm = 0x55D -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: movl %eax, %edx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: shrl %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: shrl $9, %edx -; SSE-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = +; SSE-NEXT: pmulhuw %xmm0, %xmm1 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] +; 
SSE-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psubw %xmm1, %xmm2 +; SSE-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm2 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: packusdw %xmm2, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm2 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fold_urem_vec_1: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $2, %ecx -; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 -; AVX-NEXT: shrl $19, %ecx -; AVX-NEXT: imull $124, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: movzwl %cx, %edx -; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77 -; AVX-NEXT: shrl $22, %edx -; AVX-NEXT: imull $95, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl %ecx -; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 -; AVX-NEXT: shrl $17, %ecx -; AVX-NEXT: imull $98, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $1373, %eax, %ecx # imm = 0x55D -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $9, %edx -; AVX-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -89,20 +50,34 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { ; SSE-LABEL: fold_urem_vec_2: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] -; SSE-NEXT: pmulhuw %xmm0, %xmm1 -; SSE-NEXT: psrlw $6, %xmm1 +; SSE-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $22, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 ; SSE-NEXT: psubw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: fold_urem_vec_2: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: fold_urem_vec_2: +; AVX1: # %bb.0: +; 
AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $22, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fold_urem_vec_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [44151,44151,44151,44151] +; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrld $22, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 } @@ -112,23 +87,38 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; SSE-LABEL: combine_urem_udiv: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] -; SSE-NEXT: pmulhuw %xmm0, %xmm1 -; SSE-NEXT: psrlw $6, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $22, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = <95,95,95,95,u,u,u,u> ; SSE-NEXT: pmullw %xmm1, %xmm2 ; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: paddw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_urem_udiv: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 -; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_urem_udiv: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $22, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_urem_udiv: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [44151,44151,44151,44151] +; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrld $22, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 +; AVX2-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = urem <4 x i16> %x, %2 = udiv <4 x i16> %x, %3 = add <4 x i16> %1, %2 @@ -139,41 +129,27 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; SSE-LABEL: dont_fold_urem_power_of_two: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; SSE-NEXT: shrl $22, %ecx -; SSE-NEXT: imull $95, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pextrw $1, %xmm0, %ecx -; SSE-NEXT: andl $31, %ecx -; SSE-NEXT: movd %xmm0, %edx -; SSE-NEXT: andl $63, %edx -; SSE-NEXT: movd %edx, %xmm1 -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %ecx -; SSE-NEXT: andl $7, %ecx -; SSE-NEXT: pinsrw $2, %ecx, %xmm1 -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pmovzxwd {{.*#+}} xmm1 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlw $6, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_urem_power_of_two: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; AVX-NEXT: shrl $22, %ecx -; AVX-NEXT: imull $95, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpextrw $1, %xmm0, %ecx -; AVX-NEXT: andl $31, %ecx -; AVX-NEXT: vmovd %xmm0, %edx -; AVX-NEXT: andl $63, %edx -; AVX-NEXT: vmovd %edx, %xmm1 -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %ecx -; AVX-NEXT: andl $7, %ecx -; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 -; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $6, %xmm1, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -183,65 +159,39 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { ; SSE-LABEL: dont_fold_urem_one: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: movl %eax, %edx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: shrl %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: shrl $4, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B -; SSE-NEXT: shrl $25, %ecx -; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 -; SSE-NEXT: shrl $26, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packusdw %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psubw %xmm1, %xmm2 +; SSE-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm2 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: packusdw %xmm2, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_urem_one: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: imull $25645, %eax, %ecx # imm = 0x642D -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl 
$4, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B -; AVX-NEXT: shrl $25, %ecx -; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 -; AVX-NEXT: shrl $26, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -260,119 +210,135 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { ; SSE-LABEL: dont_fold_urem_i64: ; SSE: # %bb.0: -; SSE-NEXT: movq %xmm1, %rcx -; SSE-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: mulq %rdx -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: subq %rdx, %rax +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: movabsq $-4513890722074972339, %rcx # imm = 0xC15B704DCBCA2F4D +; SSE-NEXT: mulq %rcx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: movabsq $7218291159277650633, %rcx # imm = 0x642C8590B21642C9 +; SSE-NEXT: mulq %rcx +; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psubq %xmm3, %xmm4 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movq %xmm4, %rax ; SSE-NEXT: shrq %rax -; SSE-NEXT: addq %rdx, %rax -; SSE-NEXT: shrq $4, %rax -; SSE-NEXT: leaq (%rax,%rax,2), %rdx -; SSE-NEXT: shlq $3, %rdx -; SSE-NEXT: subq %rdx, %rax -; SSE-NEXT: addq %rcx, %rax ; SSE-NEXT: movq %rax, %xmm2 -; SSE-NEXT: pextrq $1, %xmm1, %rcx -; SSE-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: mulq %rdx -; SSE-NEXT: shrq $12, %rdx -; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F -; SSE-NEXT: subq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: pextrq $1, %xmm0, %rcx -; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: paddq %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrlq $12, %xmm3 +; SSE-NEXT: psrlq $4, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [23,5423] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pmuludq %xmm3, %xmm4 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 +; SSE-NEXT: paddq %xmm4, %xmm2 +; SSE-NEXT: psubq %xmm2, %xmm1 +; SSE-NEXT: pextrq $1, %xmm0, %rax ; SSE-NEXT: shrq %rax -; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 -; SSE-NEXT: 
mulq %rdx -; SSE-NEXT: shrq $7, %rdx -; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E -; SSE-NEXT: subq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5 +; SSE-NEXT: mulq %rcx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; SSE-NEXT: psrlq $7, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,654] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pmuludq %xmm3, %xmm4 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 +; SSE-NEXT: paddq %xmm4, %xmm2 +; SSE-NEXT: psubq %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: dont_fold_urem_i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rdx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: addq %rdx, %rax -; AVX1-NEXT: shrq $4, %rax -; AVX1-NEXT: leaq (%rax,%rax,2), %rdx -; AVX1-NEXT: shlq $3, %rdx -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: addq %rcx, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rdx -; AVX1-NEXT: shrq $12, %rdx -; AVX1-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: movabsq $-4513890722074972339, %rcx # imm = 0xC15B704DCBCA2F4D +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: movabsq $7218291159277650633, %rcx # imm = 0x642C8590B21642C9 +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vmovq %xmm3, %rax ; AVX1-NEXT: shrq %rax -; AVX1-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 -; AVX1-NEXT: mulq %rdx -; AVX1-NEXT: shrq $7, %rdx -; AVX1-NEXT: imulq $654, %rdx, %rax # imm = 0x28E -; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlq $12, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlq $4, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm3 +; AVX1-NEXT: vpextrq $1, %xmm3, %rax +; AVX1-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5 +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-NEXT: vpsrlq $7, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [23,5423] +; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 +; 
AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,654] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: dont_fold_urem_i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rdx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: subq %rdx, %rax +; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: movabsq $-4513890722074972339, %rcx # imm = 0xC15B704DCBCA2F4D +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vmovq %xmm2, %rax +; AVX2-NEXT: movabsq $7218291159277650633, %rcx # imm = 0x642C8590B21642C9 +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm2[0],xmm3[0] +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5 +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm2, %rax ; AVX2-NEXT: shrq %rax -; AVX2-NEXT: addq %rdx, %rax -; AVX2-NEXT: shrq $4, %rax -; AVX2-NEXT: leaq (%rax,%rax,2), %rdx -; AVX2-NEXT: shlq $3, %rdx -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rdx -; AVX2-NEXT: shrq $12, %rdx -; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 -; AVX2-NEXT: mulq %rdx -; AVX2-NEXT: shrq $7, %rdx -; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E -; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,654,23,5423] +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %1 = urem <4 x i64> %x, ret <4 x i64> %1 -} \ No newline at end of file +} diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ 
b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -15,21 +15,22 @@ ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 ; SSE2-NEXT: imulq %rcx -; SSE2-NEXT: movq %rdx, %rax -; SSE2-NEXT: shrq $63, %rax -; SSE2-NEXT: sarq %rdx -; SSE2-NEXT: addq %rax, %rdx ; SSE2-NEXT: movq %rdx, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: imulq %rcx -; SSE2-NEXT: movq %rdx, %rax -; SSE2-NEXT: shrq $63, %rax -; SSE2-NEXT: sarq %rdx -; SSE2-NEXT: addq %rax, %rdx ; SSE2-NEXT: movq %rdx, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrad $1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlq $1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: psrlq $63, %xmm1 +; SSE2-NEXT: paddq %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_div7_2i64: @@ -37,40 +38,69 @@ ; SSE41-NEXT: pextrq $1, %xmm0, %rax ; SSE41-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 ; SSE41-NEXT: imulq %rcx -; SSE41-NEXT: movq %rdx, %rax -; SSE41-NEXT: shrq $63, %rax -; SSE41-NEXT: sarq %rdx -; SSE41-NEXT: addq %rax, %rdx ; SSE41-NEXT: movq %rdx, %xmm1 ; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: imulq %rcx -; SSE41-NEXT: movq %rdx, %rax -; SSE41-NEXT: shrq $63, %rax -; SSE41-NEXT: sarq %rdx -; SSE41-NEXT: addq %rax, %rdx ; SSE41-NEXT: movq %rdx, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrad $1, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrlq $1, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; SSE41-NEXT: psrlq $63, %xmm0 +; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: test_div7_2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpextrq $1, %xmm0, %rax -; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 -; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: test_div7_2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpsrad $1, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2NOBW-LABEL: test_div7_2i64: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpextrq $1, %xmm0, %rax +; AVX2NOBW-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm1 +; AVX2NOBW-NEXT: vmovq %xmm0, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm0 +; AVX2NOBW-NEXT: vpunpcklqdq 
{{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2NOBW-NEXT: vpsrad $1, %xmm0, %xmm1 +; AVX2NOBW-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX2NOBW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2NOBW-NEXT: vpsrlq $63, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_div7_2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax +; AVX512BW-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vpsrlq $63, %xmm0, %xmm1 +; AVX512BW-NEXT: vpsraq $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %res = sdiv <2 x i64> %a, ret <2 x i64> %res } @@ -415,89 +445,111 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: test_rem7_2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: imulq %rsi -; SSE2-NEXT: movq %rdx, %rax -; SSE2-NEXT: shrq $63, %rax -; SSE2-NEXT: sarq %rdx -; SSE2-NEXT: addq %rax, %rdx -; SSE2-NEXT: leaq (,%rdx,8), %rax -; SSE2-NEXT: subq %rax, %rdx -; SSE2-NEXT: addq %rcx, %rdx +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; SSE2-NEXT: imulq %rcx ; SSE2-NEXT: movq %rdx, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: imulq %rsi -; SSE2-NEXT: movq %rdx, %rax -; SSE2-NEXT: shrq $63, %rax -; SSE2-NEXT: sarq %rdx -; SSE2-NEXT: addq %rax, %rdx -; SSE2-NEXT: leaq (,%rdx,8), %rax -; SSE2-NEXT: subq %rax, %rdx -; SSE2-NEXT: addq %rcx, %rdx -; SSE2-NEXT: movq %rdx, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: imulq %rcx +; SSE2-NEXT: movq %rdx, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrad $1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psrlq $1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: psrlq $63, %xmm1 +; SSE2-NEXT: paddq %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psllq $3, %xmm2 +; SSE2-NEXT: psubq %xmm2, %xmm1 +; SSE2-NEXT: paddq %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_rem7_2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pextrq $1, %xmm0, %rcx -; SSE41-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: imulq %rsi -; SSE41-NEXT: movq %rdx, %rax -; SSE41-NEXT: shrq $63, %rax -; SSE41-NEXT: sarq %rdx -; SSE41-NEXT: addq %rax, %rdx -; SSE41-NEXT: leaq (,%rdx,8), %rax -; SSE41-NEXT: subq %rax, %rdx -; SSE41-NEXT: addq %rcx, %rdx +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; SSE41-NEXT: imulq %rcx ; SSE41-NEXT: movq %rdx, %xmm1 -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: imulq %rsi -; SSE41-NEXT: movq %rdx, %rax -; SSE41-NEXT: shrq $63, %rax -; SSE41-NEXT: 
sarq %rdx -; SSE41-NEXT: addq %rax, %rdx -; SSE41-NEXT: leaq (,%rdx,8), %rax -; SSE41-NEXT: subq %rax, %rdx -; SSE41-NEXT: addq %rcx, %rdx -; SSE41-NEXT: movq %rdx, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: imulq %rcx +; SSE41-NEXT: movq %rdx, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrad $1, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlq $1, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] +; SSE41-NEXT: psrlq $63, %xmm2 +; SSE41-NEXT: paddq %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psllq $3, %xmm1 +; SSE41-NEXT: psubq %xmm1, %xmm2 +; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: test_rem7_2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vmovq %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx -; AVX-NEXT: vmovq %rdx, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: test_rem7_2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsllq $3, %xmm1, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2NOBW-LABEL: test_rem7_2i64: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpextrq $1, %xmm0, %rax +; AVX2NOBW-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm1 +; AVX2NOBW-NEXT: vmovq %xmm0, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm2 +; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2NOBW-NEXT: vpsrad $1, %xmm1, %xmm2 +; AVX2NOBW-NEXT: vpsrlq $1, %xmm1, %xmm3 +; AVX2NOBW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] +; AVX2NOBW-NEXT: vpsrlq $63, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX2NOBW-NEXT: vpsllq $3, %xmm1, %xmm2 +; AVX2NOBW-NEXT: vpsubq %xmm2, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax +; AVX512BW-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; 
AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512BW-NEXT: vpsrlq $63, %xmm1, %xmm2 +; AVX512BW-NEXT: vpsraq $1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsllq $3, %xmm1, %xmm2 +; AVX512BW-NEXT: vpsubq %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %res = srem <2 x i64> %a, ret <2 x i64> %res } diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -14,73 +14,80 @@ ; AVX1-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 ; AVX1-NEXT: imulq %rcx -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: imulq %rcx -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: imulq %rcx -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm2 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: imulq %rcx -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpsrad $1, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: test_div7_4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 -; AVX2-NEXT: imulq %rcx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: imulq %rcx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: imulq %rcx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: imulq %rcx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2NOBW-LABEL: test_div7_4i64: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2NOBW-NEXT: vpextrq $1, %xmm1, %rax +; AVX2NOBW-NEXT: movabsq 
$5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm2 +; AVX2NOBW-NEXT: vmovq %xmm1, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm1 +; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2NOBW-NEXT: vpextrq $1, %xmm0, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm2 +; AVX2NOBW-NEXT: vmovq %xmm0, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm0 +; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2NOBW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpsrad $1, %ymm0, %ymm1 +; AVX2NOBW-NEXT: vpsrlq $1, %ymm0, %ymm2 +; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2NOBW-NEXT: vpsrlq $63, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_div7_4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax +; AVX512BW-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vmovq %xmm1, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrlq $63, %ymm0, %ymm1 +; AVX512BW-NEXT: vpsraq $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq %res = sdiv <4 x i64> %a, ret <4 x i64> %res } @@ -351,108 +358,95 @@ ; AVX1-LABEL: test_rem7_4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: imulq %rsi -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: leaq (,%rdx,8), %rax -; AVX1-NEXT: subq %rax, %rdx -; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX1-NEXT: imulq %rcx ; AVX1-NEXT: vmovq %rdx, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: imulq %rsi -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: leaq (,%rdx,8), %rax -; AVX1-NEXT: subq %rax, %rdx -; AVX1-NEXT: addq %rcx, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: imulq %rsi -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: leaq (,%rdx,8), %rax -; AVX1-NEXT: subq %rax, %rdx -; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpsrad $1, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; AVX1-NEXT: 
vpsrlq $63, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsllq $3, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: imulq %rcx ; AVX1-NEXT: vmovq %rdx, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: imulq %rsi -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: leaq (,%rdx,8), %rax -; AVX1-NEXT: subq %rax, %rdx -; AVX1-NEXT: addq %rcx, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpsrad $1, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsllq $3, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: test_rem7_4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: imulq %rsi -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: leaq (,%rdx,8), %rax -; AVX2-NEXT: subq %rax, %rdx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: imulq %rsi -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: leaq (,%rdx,8), %rax -; AVX2-NEXT: subq %rax, %rdx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: imulq %rsi -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: leaq (,%rdx,8), %rax -; AVX2-NEXT: subq %rax, %rdx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: imulq %rsi -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: leaq (,%rdx,8), %rax -; AVX2-NEXT: subq %rax, %rdx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2NOBW-LABEL: test_rem7_4i64: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2NOBW-NEXT: vpextrq $1, %xmm1, %rax +; AVX2NOBW-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm2 +; AVX2NOBW-NEXT: vmovq %xmm1, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm1 +; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2NOBW-NEXT: vpextrq $1, %xmm0, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: vmovq %rdx, %xmm2 +; AVX2NOBW-NEXT: vmovq %xmm0, %rax +; AVX2NOBW-NEXT: imulq %rcx +; AVX2NOBW-NEXT: 
vmovq %rdx, %xmm3 +; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2NOBW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2NOBW-NEXT: vpsrad $1, %ymm1, %ymm2 +; AVX2NOBW-NEXT: vpsrlq $1, %ymm1, %ymm3 +; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2NOBW-NEXT: vpsrlq $63, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2NOBW-NEXT: vpsllq $3, %ymm1, %ymm2 +; AVX2NOBW-NEXT: vpsubq %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax +; AVX512BW-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vmovq %xmm1, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: vmovq %rdx, %xmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vpsrlq $63, %ymm1, %ymm2 +; AVX512BW-NEXT: vpsraq $1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpsllq $3, %ymm1, %ymm2 +; AVX512BW-NEXT: vpsubq %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq %res = srem <4 x i64> %a, ret <4 x i64> %res } diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -9,73 +9,44 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind { ; AVX-LABEL: test_div7_8i64: ; AVX: # %bb.0: -; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX-NEXT: vpextrq $1, %xmm1, %rax -; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 -; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm2 -; AVX-NEXT: vmovq %xmm1, %rax -; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX-NEXT: vpextrq $1, %xmm2, %rax +; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 ; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm3 ; AVX-NEXT: vmovq %xmm2, %rax ; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vpextrq $1, %xmm1, %rax +; AVX-NEXT: imulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm1, %rax +; AVX-NEXT: imulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX-NEXT: vinserti128 $1, 
%xmm2, %ymm1, %ymm1 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpextrq $1, %xmm2, %rax ; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm3 ; AVX-NEXT: vmovq %xmm2, %rax ; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX-NEXT: vpextrq $1, %xmm0, %rax ; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm3 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: imulq %rcx -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX-NEXT: vpsrlq $63, %zmm0, %zmm1 +; AVX-NEXT: vpsraq $1, %zmm0, %zmm0 +; AVX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX-NEXT: retq %res = sdiv <8 x i64> %a, ret <8 x i64> %res @@ -291,105 +262,47 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-LABEL: test_rem7_8i64: ; AVX: # %bb.0: -; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX-NEXT: vpextrq $1, %xmm1, %rcx -; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx -; AVX-NEXT: vmovq %rdx, %xmm2 -; AVX-NEXT: vmovq %xmm1, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX-NEXT: vpextrq $1, %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX-NEXT: vpextrq $1, %xmm2, %rax +; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; AVX-NEXT: imulq %rcx ; AVX-NEXT: vmovq %rdx, %xmm3 -; AVX-NEXT: vmovq %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vmovq %xmm2, %rax +; AVX-NEXT: imulq %rcx ; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vpextrq $1, %xmm1, %rax +; AVX-NEXT: imulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm1, %rax +; AVX-NEXT: imulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vextracti128 $1, %ymm0, 
%xmm2 -; AVX-NEXT: vpextrq $1, %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vpextrq $1, %xmm2, %rax +; AVX-NEXT: imulq %rcx ; AVX-NEXT: vmovq %rdx, %xmm3 -; AVX-NEXT: vmovq %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vmovq %xmm2, %rax +; AVX-NEXT: imulq %rcx ; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: imulq %rcx ; AVX-NEXT: vmovq %rdx, %xmm3 -; AVX-NEXT: vmovq %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi -; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx -; AVX-NEXT: vmovq %rdx, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: imulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm4 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX-NEXT: vpsrlq $63, %zmm1, %zmm2 +; AVX-NEXT: vpsraq $1, %zmm1, %zmm1 +; AVX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX-NEXT: vpsllq $3, %zmm1, %zmm2 +; AVX-NEXT: vpsubq %zmm2, %zmm1, %zmm1 +; AVX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX-NEXT: retq %res = srem <8 x i64> %a, ret <8 x i64> %res diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -12,66 +12,50 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: test_div7_2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: mulq %rsi -; SSE2-NEXT: subq %rdx, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: addq %rdx, %rcx -; SSE2-NEXT: movq %rcx, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: mulq %rsi -; SSE2-NEXT: subq %rdx, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: addq %rdx, %rcx -; SSE2-NEXT: movq %rcx, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: psrlq $2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; SSE2-NEXT: mulq %rcx +; SSE2-NEXT: movq %rdx, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: mulq %rcx +; SSE2-NEXT: movq %rdx, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; 
SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: psrlq $1, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: psrlq $2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_div7_2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pextrq $1, %xmm0, %rcx -; SSE41-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %rsi -; SSE41-NEXT: subq %rdx, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: addq %rdx, %rcx -; SSE41-NEXT: movq %rcx, %xmm1 -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %rsi -; SSE41-NEXT: subq %rdx, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: addq %rdx, %rcx -; SSE41-NEXT: movq %rcx, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; SSE41-NEXT: mulq %rcx +; SSE41-NEXT: movq %rdx, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: mulq %rcx +; SSE41-NEXT: movq %rdx, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE41-NEXT: psubq %xmm2, %xmm0 +; SSE41-NEXT: psrlq $1, %xmm0 +; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: psrlq $2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_div7_2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vmovq %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm2 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlq $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrlq $2, %xmm0, %xmm0 ; AVX-NEXT: retq %res = udiv <2 x i64> %a, @@ -430,94 +414,66 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: test_rem7_2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: mulq %rsi -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: subq %rdx, %rax -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: addq %rdx, %rax -; SSE2-NEXT: shrq $2, %rax -; SSE2-NEXT: leaq (,%rax,8), %rdx -; SSE2-NEXT: subq %rdx, %rax -; SSE2-NEXT: addq %rcx, %rax -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: mulq %rsi -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: subq %rdx, %rax -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: addq %rdx, %rax -; SSE2-NEXT: shrq $2, %rax -; SSE2-NEXT: leaq (,%rax,8), %rdx -; SSE2-NEXT: subq %rdx, %rax -; SSE2-NEXT: addq %rcx, %rax -; SSE2-NEXT: movq %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; SSE2-NEXT: mulq %rcx +; SSE2-NEXT: movq %rdx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm1, %rax 
+; SSE2-NEXT: mulq %rcx +; SSE2-NEXT: movq %rdx, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psubq %xmm2, %xmm1 +; SSE2-NEXT: psrlq $1, %xmm1 +; SSE2-NEXT: paddq %xmm2, %xmm1 +; SSE2-NEXT: psrlq $2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psllq $3, %xmm2 +; SSE2-NEXT: psubq %xmm2, %xmm1 +; SSE2-NEXT: paddq %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_rem7_2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pextrq $1, %xmm0, %rcx -; SSE41-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %rsi -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: subq %rdx, %rax -; SSE41-NEXT: shrq %rax -; SSE41-NEXT: addq %rdx, %rax -; SSE41-NEXT: shrq $2, %rax -; SSE41-NEXT: leaq (,%rax,8), %rdx -; SSE41-NEXT: subq %rdx, %rax -; SSE41-NEXT: addq %rcx, %rax -; SSE41-NEXT: movq %rax, %xmm1 -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %rsi -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: subq %rdx, %rax -; SSE41-NEXT: shrq %rax -; SSE41-NEXT: addq %rdx, %rax -; SSE41-NEXT: shrq $2, %rax -; SSE41-NEXT: leaq (,%rax,8), %rdx -; SSE41-NEXT: subq %rdx, %rax -; SSE41-NEXT: addq %rcx, %rax -; SSE41-NEXT: movq %rax, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; SSE41-NEXT: mulq %rcx +; SSE41-NEXT: movq %rdx, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: mulq %rcx +; SSE41-NEXT: movq %rdx, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psubq %xmm2, %xmm1 +; SSE41-NEXT: psrlq $1, %xmm1 +; SSE41-NEXT: paddq %xmm2, %xmm1 +; SSE41-NEXT: psrlq $2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psllq $3, %xmm2 +; SSE41-NEXT: psubq %xmm2, %xmm1 +; SSE41-NEXT: paddq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_rem7_2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vmovq %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm2 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpsrlq $1, %xmm2, %xmm2 +; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpsrlq $2, %xmm1, %xmm1 +; AVX-NEXT: vpsllq $3, %xmm1, %xmm2 +; AVX-NEXT: vpsubq %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %res = urem <2 x i64> %a, ret <2 x i64> 
%res diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -10,39 +10,29 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind { ; AVX1-LABEL: test_div7_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: subq %rdx, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: addq %rdx, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm1 -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: subq %rdx, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: addq %rdx, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm2 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: subq %rdx, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: addq %rdx, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: subq %rdx, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: addq %rdx, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -50,38 +40,25 @@ ; AVX2-LABEL: test_div7_4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: subq %rdx, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: subq %rdx, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: subq %rdx, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: subq %rdx, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: addq 
%rdx, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlq $2, %ymm0, %ymm0 ; AVX2-NEXT: retq %res = udiv <4 x i64> %a, @@ -370,115 +347,64 @@ ; AVX1-LABEL: test_rem7_4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: addq %rdx, %rax -; AVX1-NEXT: shrq $2, %rax -; AVX1-NEXT: leaq (,%rax,8), %rdx -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: addq %rcx, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: addq %rdx, %rax -; AVX1-NEXT: shrq $2, %rax -; AVX1-NEXT: leaq (,%rax,8), %rdx -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: addq %rcx, %rax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: addq %rdx, %rax -; AVX1-NEXT: shrq $2, %rax -; AVX1-NEXT: leaq (,%rax,8), %rdx -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: addq %rcx, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: addq %rdx, %rax -; AVX1-NEXT: shrq $2, %rax -; AVX1-NEXT: leaq (,%rax,8), %rdx -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: addq %rcx, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $3, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: mulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $3, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: 
retq ; ; AVX2-LABEL: test_rem7_4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: addq %rdx, %rax -; AVX2-NEXT: shrq $2, %rax -; AVX2-NEXT: leaq (,%rax,8), %rdx -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: addq %rdx, %rax -; AVX2-NEXT: shrq $2, %rax -; AVX2-NEXT: leaq (,%rax,8), %rdx -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: addq %rdx, %rax -; AVX2-NEXT: shrq $2, %rax -; AVX2-NEXT: leaq (,%rax,8), %rdx -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: addq %rdx, %rax -; AVX2-NEXT: shrq $2, %rax -; AVX2-NEXT: leaq (,%rax,8), %rdx -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: mulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlq $1, %ymm2, %ymm2 +; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpsrlq $2, %ymm1, %ymm1 +; AVX2-NEXT: vpsllq $3, %ymm1, %ymm2 +; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %res = urem <4 x i64> %a, ret <4 x i64> %res diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -9,73 +9,44 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind { ; AVX-LABEL: test_div7_8i64: ; AVX: # %bb.0: -; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX-NEXT: vpextrq $1, %xmm1, %rcx -; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm2 -; AVX-NEXT: vmovq %xmm1, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0] -; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX-NEXT: vpextrq $1, %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm3 -; AVX-NEXT: vmovq %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm2 +; AVX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX-NEXT: vpextrq $1, %xmm2, %rax +; AVX-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm2, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vpextrq $1, %xmm1, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm1, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpextrq $1, %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm3 -; AVX-NEXT: vmovq %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm2 +; AVX-NEXT: vpextrq $1, %xmm2, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm2, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm3 -; AVX-NEXT: vmovq %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm4 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX-NEXT: vpsrlq $1, %zmm0, %zmm0 +; AVX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX-NEXT: vpsrlq $2, %zmm0, %zmm0 ; AVX-NEXT: retq %res = udiv <8 x i64> %a, @@ -295,113 +266,48 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-LABEL: test_rem7_8i64: ; AVX: # %bb.0: -; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX-NEXT: vpextrq $1, %xmm1, %rcx -; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm2 -; AVX-NEXT: vmovq %xmm1, 
%rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX-NEXT: vpextrq $1, %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm3 -; AVX-NEXT: vmovq %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm2 +; AVX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX-NEXT: vpextrq $1, %xmm2, %rax +; AVX-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm2, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vpextrq $1, %xmm1, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm1, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpextrq $1, %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm3 -; AVX-NEXT: vmovq %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm2 +; AVX-NEXT: vpextrq $1, %xmm2, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm2, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm3 -; AVX-NEXT: vmovq %xmm0, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; 
AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: vmovq %rdx, %xmm4 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX-NEXT: vpsubq %zmm1, %zmm0, %zmm2 +; AVX-NEXT: vpsrlq $1, %zmm2, %zmm2 +; AVX-NEXT: vpaddq %zmm1, %zmm2, %zmm1 +; AVX-NEXT: vpsrlq $2, %zmm1, %zmm1 +; AVX-NEXT: vpsllq $3, %zmm1, %zmm2 +; AVX-NEXT: vpsubq %zmm2, %zmm1, %zmm1 +; AVX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX-NEXT: retq %res = urem <8 x i64> %a, ret <8 x i64> %res diff --git a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll --- a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll @@ -6,17 +6,16 @@ ; X64-LABEL: test_udiv7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; X64-NEXT: pmuludq %xmm1, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; X64-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] ; X64-NEXT: psubd %xmm2, %xmm0 -; X64-NEXT: psrld $1, %xmm0 -; X64-NEXT: paddd %xmm2, %xmm0 +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: psrlq $1, %xmm0 +; X64-NEXT: psrlq $32, %xmm1 +; X64-NEXT: paddd %xmm0, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; X64-NEXT: psrld $2, %xmm0 ; X64-NEXT: movq %xmm0, (%rsi) ; X64-NEXT: retq @@ -26,18 +25,18 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X86-NEXT: movdqa %xmm0, %xmm3 -; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0,0,1,1] +; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; X86-NEXT: psrlq $32, %xmm1 ; X86-NEXT: psubd %xmm2, %xmm0 -; X86-NEXT: psrld $1, %xmm0 -; X86-NEXT: paddd %xmm2, %xmm0 +; X86-NEXT: pxor %xmm2, %xmm2 +; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-NEXT: psllq $31, %xmm0 +; X86-NEXT: psrlq $32, %xmm0 +; X86-NEXT: paddd %xmm1, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: psrld $2, %xmm0 ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl @@ -51,18 +50,17 @@ ; X64-LABEL: test_urem7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X64-NEXT: pshufd {{.*#+}} 
xmm3 = xmm0[1,1,3,3]
-; X64-NEXT: pmuludq %xmm1, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-NEXT: movdqa %xmm0, %xmm1
-; X64-NEXT: psubd %xmm2, %xmm1
-; X64-NEXT: psrld $1, %xmm1
-; X64-NEXT: paddd %xmm2, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; X64-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; X64-NEXT: movdqa %xmm0, %xmm3
+; X64-NEXT: psubd %xmm2, %xmm3
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT: psrlq $1, %xmm3
+; X64-NEXT: psrlq $32, %xmm1
+; X64-NEXT: paddd %xmm3, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; X64-NEXT: psrld $2, %xmm1
 ; X64-NEXT: movdqa %xmm1, %xmm2
 ; X64-NEXT: pslld $3, %xmm2
@@ -76,19 +74,19 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86-NEXT: movdqa %xmm0, %xmm3
-; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; X86-NEXT: movdqa %xmm0, %xmm1
-; X86-NEXT: psubd %xmm2, %xmm1
-; X86-NEXT: psrld $1, %xmm1
-; X86-NEXT: paddd %xmm2, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; X86-NEXT: psrlq $32, %xmm1
+; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: psubd %xmm2, %xmm3
+; X86-NEXT: pxor %xmm2, %xmm2
+; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X86-NEXT: psllq $31, %xmm3
+; X86-NEXT: psrlq $32, %xmm3
+; X86-NEXT: paddd %xmm1, %xmm3
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
 ; X86-NEXT: psrld $2, %xmm1
 ; X86-NEXT: movdqa %xmm1, %xmm2
 ; X86-NEXT: pslld $3, %xmm2
@@ -106,25 +104,27 @@
 ; X64-LABEL: test_sdiv7_v2i32:
 ; X64: # %bb.0:
 ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pmuludq %xmm1, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64-NEXT: pmuludq %xmm1, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X64-NEXT: pxor %xmm3, %xmm3
-; X64-NEXT: pcmpgtd %xmm0, %xmm3
-; X64-NEXT: pand %xmm1, %xmm3
-; X64-NEXT: paddd %xmm0, %xmm3
-; X64-NEXT: psubd %xmm3, %xmm2
-; X64-NEXT: paddd %xmm0, %xmm2
-; X64-NEXT: movdqa %xmm2, %xmm0
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; X64-NEXT: movdqa {{.*#+}} xmm3 = [18446744071868851347,18446744071868851347]
+; X64-NEXT: pmuludq %xmm3, %xmm2
+; X64-NEXT: movdqa %xmm0, %xmm4
+; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
+; X64-NEXT: pmuludq %xmm4, %xmm1
+; X64-NEXT: paddq %xmm2, %xmm1
+; X64-NEXT: psllq $32, %xmm1
+; X64-NEXT: pmuludq %xmm3, %xmm4
+; X64-NEXT: paddq %xmm1, %xmm4
+; X64-NEXT: psrlq $32, %xmm4
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; X64-NEXT: paddd %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
 ; X64-NEXT: psrld $31, %xmm0
-; X64-NEXT: psrad $2, %xmm2
-; X64-NEXT: paddd %xmm0, %xmm2
-; X64-NEXT: movq %xmm2, (%rsi)
+; X64-NEXT: psrad $2, %xmm1
+; X64-NEXT: paddd %xmm0, %xmm1
+; X64-NEXT: movq %xmm1, (%rsi)
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: test_sdiv7_v2i32:
@@ -132,26 +132,27 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86-NEXT: movdqa %xmm0, %xmm3
-; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X86-NEXT: pxor %xmm3, %xmm3
-; X86-NEXT: pcmpgtd %xmm0, %xmm3
-; X86-NEXT: pand %xmm1, %xmm3
-; X86-NEXT: paddd %xmm0, %xmm3
-; X86-NEXT: psubd %xmm3, %xmm2
-; X86-NEXT: paddd %xmm0, %xmm2
-; X86-NEXT: movdqa %xmm2, %xmm0
+; X86-NEXT: pxor %xmm1, %xmm1
+; X86-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; X86-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,4294967295,2454267027,4294967295]
+; X86-NEXT: pmuludq %xmm3, %xmm2
+; X86-NEXT: movdqa %xmm0, %xmm4
+; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; X86-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-NEXT: pmuludq %xmm4, %xmm1
+; X86-NEXT: paddq %xmm2, %xmm1
+; X86-NEXT: psllq $32, %xmm1
+; X86-NEXT: pmuludq %xmm3, %xmm4
+; X86-NEXT: paddq %xmm1, %xmm4
+; X86-NEXT: psrlq $32, %xmm4
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; X86-NEXT: paddd %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm0
 ; X86-NEXT: psrld $31, %xmm0
-; X86-NEXT: psrad $2, %xmm2
-; X86-NEXT: paddd %xmm0, %xmm2
-; X86-NEXT: movq %xmm2, (%eax)
+; X86-NEXT: psrad $2, %xmm1
+; X86-NEXT: paddd %xmm0, %xmm1
+; X86-NEXT: movq %xmm1, (%eax)
 ; X86-NEXT: retl
 %a = load <2 x i32>, <2 x i32>* %x
 %b = sdiv <2 x i32> %a,
@@ -163,29 +164,31 @@
 ; X64-LABEL: test_srem7_v2i32:
 ; X64: # %bb.0:
 ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pmuludq %xmm1, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64-NEXT: pmuludq %xmm1, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X64-NEXT: pxor %xmm3, %xmm3
-; X64-NEXT: pcmpgtd %xmm0, %xmm3
-; X64-NEXT: pand %xmm1, %xmm3
-; X64-NEXT: paddd %xmm0, %xmm3
-; X64-NEXT: psubd %xmm3, %xmm2
-; X64-NEXT: paddd %xmm0, %xmm2
-; X64-NEXT: movdqa %xmm2, %xmm1
-; X64-NEXT: psrld $31, %xmm1
-; X64-NEXT: psrad $2, %xmm2
-; X64-NEXT: paddd %xmm1, %xmm2
-; X64-NEXT: movdqa %xmm2, %xmm1
-; X64-NEXT: pslld $3, %xmm1
-; X64-NEXT: psubd %xmm1, %xmm2
-; X64-NEXT: paddd %xmm0, %xmm2
-; X64-NEXT: movq %xmm2, (%rsi)
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; X64-NEXT: movdqa {{.*#+}} xmm3 = [18446744071868851347,18446744071868851347]
+; X64-NEXT: pmuludq %xmm3, %xmm2
+; X64-NEXT: movdqa %xmm0, %xmm4
+; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
+; X64-NEXT: pmuludq %xmm4, %xmm1
+; X64-NEXT: paddq %xmm2, %xmm1
+; X64-NEXT: psllq $32, %xmm1
+; X64-NEXT: pmuludq %xmm3, %xmm4
+; X64-NEXT: paddq %xmm1, %xmm4
+; X64-NEXT: psrlq $32, %xmm4
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; X64-NEXT: paddd %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: psrld $31, %xmm2
+; X64-NEXT: psrad $2, %xmm1
+; X64-NEXT: paddd %xmm2, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: pslld $3, %xmm2
+; X64-NEXT: psubd %xmm2, %xmm1
+; X64-NEXT: paddd %xmm0, %xmm1
+; X64-NEXT: movq %xmm1, (%rsi)
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: test_srem7_v2i32:
@@ -193,30 +196,31 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86-NEXT: movdqa %xmm0, %xmm3
-; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X86-NEXT: pxor %xmm3, %xmm3
-; X86-NEXT: pcmpgtd %xmm0, %xmm3
-; X86-NEXT: pand %xmm1, %xmm3
-; X86-NEXT: paddd %xmm0, %xmm3
-; X86-NEXT: psubd %xmm3, %xmm2
-; X86-NEXT: paddd %xmm0, %xmm2
-; X86-NEXT: movdqa %xmm2, %xmm1
-; X86-NEXT: psrld $31, %xmm1
-; X86-NEXT: psrad $2, %xmm2
-; X86-NEXT: paddd %xmm1, %xmm2
-; X86-NEXT: movdqa %xmm2, %xmm1
-; X86-NEXT: pslld $3, %xmm1
-; X86-NEXT: psubd %xmm1, %xmm2
-; X86-NEXT: paddd %xmm0, %xmm2
-; X86-NEXT: movq %xmm2, (%eax)
+; X86-NEXT: pxor %xmm1, %xmm1
+; X86-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; X86-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,4294967295,2454267027,4294967295]
+; X86-NEXT: pmuludq %xmm3, %xmm2
+; X86-NEXT: movdqa %xmm0, %xmm4
+; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; X86-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-NEXT: pmuludq %xmm4, %xmm1
+; X86-NEXT: paddq %xmm2, %xmm1
+; X86-NEXT: psllq $32, %xmm1
+; X86-NEXT: pmuludq %xmm3, %xmm4
+; X86-NEXT: paddq %xmm1, %xmm4
+; X86-NEXT: psrlq $32, %xmm4
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; X86-NEXT: paddd %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm2
+; X86-NEXT: psrld $31, %xmm2
+; X86-NEXT: psrad $2, %xmm1
+; X86-NEXT: paddd %xmm2, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm2
+; X86-NEXT: pslld $3, %xmm2
+; X86-NEXT: psubd %xmm2, %xmm1
+; X86-NEXT: paddd %xmm0, %xmm1
+; X86-NEXT: movq %xmm1, (%eax)
 ; X86-NEXT: retl
 %a = load <2 x i32>, <2 x i32>* %x
 %b = srem <2 x i32> %a,
diff --git a/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll b/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll
--- a/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll
@@ -8,29 +8,31 @@
 define <8 x i8> @vshli_target_constant(<8 x i16> %arg, <8 x i32> %arg1) {
 ; CHECK-LABEL: vshli_target_constant:
 ; CHECK: # %bb.0: # %bb
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; CHECK-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
-; CHECK-NEXT: pmuludq %xmm0, %xmm3
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,3,3]
+; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2863311531,2863311531]
+; CHECK-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-NEXT: psrlq $33, %xmm3
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; CHECK-NEXT: pmuludq %xmm4, %xmm0
+; CHECK-NEXT: psrlq $33, %xmm0
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,3,3]
+; CHECK-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-NEXT: psrlq $33, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
+; CHECK-NEXT: pmuludq %xmm4, %xmm2
+; CHECK-NEXT: psrlq $33, %xmm2
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
 ; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: pslld $15, %xmm2
+; CHECK-NEXT: pslld $16, %xmm2
 ; CHECK-NEXT: psrad $16, %xmm2
-; CHECK-NEXT: pslld $15, %xmm4
-; CHECK-NEXT: psrad $16, %xmm4
-; CHECK-NEXT: packssdw %xmm2, %xmm4
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm4
-; CHECK-NEXT: pxor %xmm0, %xmm0
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; CHECK-NEXT: pmullw %xmm4, %xmm1
+; CHECK-NEXT: pslld $16, %xmm0
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: packssdw %xmm2, %xmm0
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: pmullw %xmm0, %xmm1
 ; CHECK-NEXT: movdqa %xmm1, %xmm0
 ; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]