Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -5996,17 +5996,18 @@ return SDValue(LoHi.getNode(), 1); } // If type twice as wide legal, widen and use a mul plus a shift. - if (!VT.isVector()) { - unsigned Size = VT.getSizeInBits(); - EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), Size * 2); - if (isOperationLegal(ISD::MUL, WideVT)) { - X = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, X); - Y = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, Y); - Y = DAG.getNode(ISD::MUL, dl, WideVT, X, Y); - Y = DAG.getNode(ISD::SRL, dl, WideVT, Y, - DAG.getShiftAmountConstant(EltBits, WideVT, dl)); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Y); - } + unsigned Size = VT.getScalarSizeInBits(); + EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), Size * 2); + if (VT.isVector()) + WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT, + VT.getVectorElementCount()); + if (isOperationLegalOrCustom(ISD::MUL, WideVT)) { + X = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, X); + Y = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, Y); + Y = DAG.getNode(ISD::MUL, dl, WideVT, X, Y); + Y = DAG.getNode(ISD::SRL, dl, WideVT, Y, + DAG.getShiftAmountConstant(EltBits, WideVT, dl)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Y); } return SDValue(); }; @@ -6182,17 +6183,18 @@ return SDValue(LoHi.getNode(), 1); } // If type twice as wide legal, widen and use a mul plus a shift. - if (!VT.isVector()) { - unsigned Size = VT.getSizeInBits(); - EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), Size * 2); - if (isOperationLegal(ISD::MUL, WideVT)) { - X = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, X); - Y = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Y); - Y = DAG.getNode(ISD::MUL, dl, WideVT, X, Y); - Y = DAG.getNode(ISD::SRL, dl, WideVT, Y, - DAG.getShiftAmountConstant(EltBits, WideVT, dl)); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Y); - } + unsigned Size = VT.getScalarSizeInBits(); + EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), Size * 2); + if (VT.isVector()) + WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT, + VT.getVectorElementCount()); + if (isOperationLegalOrCustom(ISD::MUL, WideVT)) { + X = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, X); + Y = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Y); + Y = DAG.getNode(ISD::MUL, dl, WideVT, X, Y); + Y = DAG.getNode(ISD::SRL, dl, WideVT, Y, + DAG.getShiftAmountConstant(EltBits, WideVT, dl)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Y); } return SDValue(); // No mulhu or equivalent }; Index: llvm/test/CodeGen/AArch64/srem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/AArch64/srem-vector-lkk.ll +++ llvm/test/CodeGen/AArch64/srem-vector-lkk.ll @@ -4,50 +4,20 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { ; CHECK-LABEL: fold_srem_vec_1: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: smov w9, v0.h[0] -; CHECK-NEXT: mov w10, #63421 // =0xf7bd -; CHECK-NEXT: mov w11, #37253 // =0x9185 -; CHECK-NEXT: movk w10, #31710, lsl #16 -; CHECK-NEXT: movk w11, #44150, lsl #16 -; CHECK-NEXT: smov w13, v0.h[2] -; CHECK-NEXT: mov w12, #33437 // =0x829d -; CHECK-NEXT: smull x10, w8, w10 -; CHECK-NEXT: movk w12, #21399, lsl #16 -; CHECK-NEXT: smull x11, w9, w11 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: sub w10, w10, w8 -; CHECK-NEXT: add w11, w11, w9 -; CHECK-NEXT: asr w14, w10, #6 -; CHECK-NEXT: asr w15, w11, #6 -; CHECK-NEXT: add w10, w14, w10, lsr #31 -; CHECK-NEXT: add w11, w15, w11, lsr #31 -; CHECK-NEXT: mov w14, #95 // =0x5f -; CHECK-NEXT: mov w15, #-124 // =0xffffff84 -; CHECK-NEXT: smull x12, w13, w12 -; CHECK-NEXT: msub w9, w11, w14, w9 -; CHECK-NEXT: msub w8, w10, w15, w8 -; CHECK-NEXT: lsr x10, x12, #63 -; CHECK-NEXT: asr x11, x12, #37 -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: add w10, w11, w10 -; CHECK-NEXT: mov w11, #98 // =0x62 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov w9, #63249 // =0xf711 -; CHECK-NEXT: movk w9, #48808, lsl #16 -; CHECK-NEXT: msub w10, w10, w11, w13 -; CHECK-NEXT: smull x9, w12, w9 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: lsr x8, x9, #63 -; CHECK-NEXT: asr x9, x9, #40 -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: mov w9, #-1003 // =0xfffffc15 -; CHECK-NEXT: mov v0.h[2], w10 -; CHECK-NEXT: msub w8, w8, w9, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_1] +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: adrp x8, .LCPI0_2 +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: mla v1.4h, v0.4h, v2.4h +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: adrp x8, .LCPI0_3 +; CHECK-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_3] +; CHECK-NEXT: usra v1.4h, v1.4h, #15 +; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -56,43 +26,15 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { ; CHECK-LABEL: fold_srem_vec_2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w9, v0.h[0] -; CHECK-NEXT: mov w8, #37253 // =0x9185 -; CHECK-NEXT: movk w8, #44150, lsl #16 -; CHECK-NEXT: smov w10, v0.h[1] -; CHECK-NEXT: smov w14, v0.h[2] -; CHECK-NEXT: mov w12, #95 // =0x5f -; CHECK-NEXT: smull x11, w9, w8 -; CHECK-NEXT: smull x13, w10, w8 -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: add w11, w11, w9 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: asr w15, w11, #6 -; CHECK-NEXT: add w13, w13, w10 -; CHECK-NEXT: add w11, w15, w11, lsr #31 -; CHECK-NEXT: smov w15, v0.h[3] -; CHECK-NEXT: asr w16, w13, #6 -; CHECK-NEXT: msub w9, w11, w12, w9 -; CHECK-NEXT: add w13, w16, w13, lsr #31 -; CHECK-NEXT: smull x11, w14, w8 -; CHECK-NEXT: msub w10, w13, w12, w10 -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: smull x8, w15, w8 -; CHECK-NEXT: add w11, w11, w14 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: asr w9, w11, #6 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: add w9, w9, w11, lsr #31 -; CHECK-NEXT: add w8, w8, w15 -; CHECK-NEXT: mov v0.h[1], w10 -; CHECK-NEXT: asr w10, w8, #6 -; CHECK-NEXT: msub w9, w9, w12, w14 -; CHECK-NEXT: add w8, w10, w8, lsr #31 -; CHECK-NEXT: msub w8, w8, w12, w15 -; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov w8, #44151 // =0xac77 +; CHECK-NEXT: movi v2.4h, #95 +; CHECK-NEXT: dup v1.4h, w8 +; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: add v1.4h, v1.4h, v0.4h +; CHECK-NEXT: sshr v1.4h, v1.4h, #6 +; CHECK-NEXT: usra v1.4h, v1.4h, #15 +; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -103,46 +45,15 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; CHECK-LABEL: combine_srem_sdiv: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w9, v0.h[0] -; CHECK-NEXT: mov w8, #37253 // =0x9185 -; CHECK-NEXT: movk w8, #44150, lsl #16 -; CHECK-NEXT: smov w10, v0.h[1] -; CHECK-NEXT: smov w11, v0.h[2] -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: mov w14, #95 // =0x5f -; CHECK-NEXT: smull x13, w9, w8 -; CHECK-NEXT: smull x15, w10, w8 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: smull x16, w11, w8 -; CHECK-NEXT: add w13, w13, w9 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: asr w17, w13, #6 -; CHECK-NEXT: add w15, w15, w10 -; CHECK-NEXT: add w13, w17, w13, lsr #31 -; CHECK-NEXT: asr w17, w15, #6 -; CHECK-NEXT: add w15, w17, w15, lsr #31 -; CHECK-NEXT: smull x8, w12, w8 -; CHECK-NEXT: msub w9, w13, w14, w9 -; CHECK-NEXT: lsr x16, x16, #32 -; CHECK-NEXT: add w16, w16, w11 -; CHECK-NEXT: msub w10, w15, w14, w10 -; CHECK-NEXT: asr w17, w16, #6 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: fmov s1, w13 -; CHECK-NEXT: add w16, w17, w16, lsr #31 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: add w8, w8, w12 -; CHECK-NEXT: asr w9, w8, #6 -; CHECK-NEXT: add w8, w9, w8, lsr #31 -; CHECK-NEXT: msub w9, w16, w14, w11 -; CHECK-NEXT: mov v0.h[1], w10 -; CHECK-NEXT: mov v1.h[1], w15 -; CHECK-NEXT: msub w10, w8, w14, w12 -; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: mov v1.h[2], w16 -; CHECK-NEXT: mov v0.h[3], w10 -; CHECK-NEXT: mov v1.h[3], w8 +; CHECK-NEXT: mov w8, #44151 // =0xac77 +; CHECK-NEXT: movi v2.4h, #95 +; CHECK-NEXT: dup v1.4h, w8 +; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: add v1.4h, v1.4h, v0.4h +; CHECK-NEXT: sshr v1.4h, v1.4h, #6 +; CHECK-NEXT: usra v1.4h, v1.4h, #15 +; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, @@ -155,37 +66,18 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_power_of_two: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w9, v0.h[1] -; CHECK-NEXT: smov w10, v0.h[0] -; CHECK-NEXT: mov w8, #37253 // =0x9185 -; CHECK-NEXT: smov w12, v0.h[2] -; CHECK-NEXT: movk w8, #44150, lsl #16 -; CHECK-NEXT: negs w11, w9 -; CHECK-NEXT: and w9, w9, #0x1f -; CHECK-NEXT: and w11, w11, #0x1f -; CHECK-NEXT: csneg w9, w9, w11, mi -; CHECK-NEXT: negs w11, w10 -; CHECK-NEXT: and w10, w10, #0x3f -; CHECK-NEXT: and w11, w11, #0x3f -; CHECK-NEXT: csneg w10, w10, w11, mi -; CHECK-NEXT: smov w11, v0.h[3] -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: negs w10, w12 -; CHECK-NEXT: smull x8, w11, w8 -; CHECK-NEXT: and w10, w10, #0x7 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: and w9, w12, #0x7 -; CHECK-NEXT: add w8, w8, w11 -; CHECK-NEXT: csneg w9, w9, w10, mi -; CHECK-NEXT: asr w10, w8, #6 -; CHECK-NEXT: add w8, w10, w8, lsr #31 -; CHECK-NEXT: mov w10, #95 // =0x5f -; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: msub w8, w8, w10, w11 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: adrp x8, .LCPI3_1 +; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: adrp x8, .LCPI3_2 +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: add v1.4h, v1.4h, v0.4h +; CHECK-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: usra v1.4h, v1.4h, #15 +; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -195,41 +87,22 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: mov w9, #30865 // =0x7891 -; CHECK-NEXT: movk w9, #51306, lsl #16 -; CHECK-NEXT: smov w10, v0.h[2] -; CHECK-NEXT: mov w11, #17097 // =0x42c9 -; CHECK-NEXT: mov w12, #654 // =0x28e -; CHECK-NEXT: movk w11, #45590, lsl #16 -; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: smull x11, w10, w11 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w9, w9, w8 -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: asr w13, w9, #9 -; CHECK-NEXT: add w11, w11, w10 -; CHECK-NEXT: add w9, w13, w9, lsr #31 -; CHECK-NEXT: mov w13, #23 // =0x17 -; CHECK-NEXT: msub w8, w9, w12, w8 -; CHECK-NEXT: asr w9, w11, #4 -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: add w9, w9, w11, lsr #31 -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov w11, #47143 // =0xb827 -; CHECK-NEXT: movk w11, #24749, lsl #16 -; CHECK-NEXT: msub w9, w9, w13, w10 -; CHECK-NEXT: smull x10, w12, w11 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: lsr x8, x10, #63 -; CHECK-NEXT: asr x10, x10, #43 -; CHECK-NEXT: add w8, w10, w8 -; CHECK-NEXT: mov w10, #5423 // =0x152f -; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: msub w8, w8, w10, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: movi d2, #0x00ffff0000ffff +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: adrp x8, .LCPI4_1 +; CHECK-NEXT: and v2.8b, v0.8b, v2.8b +; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_1] +; CHECK-NEXT: adrp x8, .LCPI4_2 +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: add v1.4h, v1.4h, v2.4h +; CHECK-NEXT: sshl v1.4h, v1.4h, v3.4h +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_2] +; CHECK-NEXT: ushr v2.4h, v1.4h, #15 +; CHECK-NEXT: mov v2.h[0], wzr +; CHECK-NEXT: add v1.4h, v1.4h, v2.4h +; CHECK-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -239,36 +112,22 @@ define <4 x i16> @dont_fold_srem_i16_smax(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_i16_smax: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w9, v0.h[2] -; CHECK-NEXT: mov w8, #17097 // =0x42c9 -; CHECK-NEXT: movk w8, #45590, lsl #16 -; CHECK-NEXT: smov w10, v0.h[1] -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: mov w11, #23 // =0x17 -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: smull x8, w9, w8 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: asr w13, w8, #4 -; CHECK-NEXT: add w8, w13, w8, lsr #31 -; CHECK-NEXT: negs w13, w10 -; CHECK-NEXT: and w10, w10, #0x7fff -; CHECK-NEXT: and w13, w13, #0x7fff -; CHECK-NEXT: csneg w10, w10, w13, mi -; CHECK-NEXT: mov w13, #47143 // =0xb827 -; CHECK-NEXT: movk w13, #24749, lsl #16 -; CHECK-NEXT: msub w8, w8, w11, w9 -; CHECK-NEXT: smull x9, w12, w13 -; CHECK-NEXT: mov v1.h[1], w10 -; CHECK-NEXT: lsr x10, x9, #63 -; CHECK-NEXT: asr x9, x9, #43 -; CHECK-NEXT: add w9, w9, w10 -; CHECK-NEXT: mov w10, #5423 // =0x152f -; CHECK-NEXT: mov v1.h[2], w8 -; CHECK-NEXT: msub w8, w9, w10, w12 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: adrp x8, .LCPI5_1 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI5_1] +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: adrp x8, .LCPI5_2 +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: mla v1.4h, v0.4h, v2.4h +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI5_2] +; CHECK-NEXT: adrp x8, .LCPI5_3 +; CHECK-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI5_3] +; CHECK-NEXT: ushr v2.4h, v1.4h, #15 +; CHECK-NEXT: mov v2.h[0], wzr +; CHECK-NEXT: add v1.4h, v1.4h, v2.4h +; CHECK-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -337,67 +196,13 @@ define <8 x i8> @fold_srem_v8i8(<8 x i8> %x) { ; CHECK-LABEL: fold_srem_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w10, v0.b[0] -; CHECK-NEXT: mov w8, #26215 // =0x6667 -; CHECK-NEXT: movk w8, #26214, lsl #16 -; CHECK-NEXT: smov w11, v0.b[1] -; CHECK-NEXT: smull x9, w10, w8 -; CHECK-NEXT: smull x14, w11, w8 -; CHECK-NEXT: lsr x12, x9, #63 -; CHECK-NEXT: asr x13, x9, #34 -; CHECK-NEXT: mov w9, #10 // =0xa -; CHECK-NEXT: add w12, w13, w12 -; CHECK-NEXT: smov w13, v0.b[2] -; CHECK-NEXT: msub w10, w12, w9, w10 -; CHECK-NEXT: lsr x12, x14, #63 -; CHECK-NEXT: asr x14, x14, #34 -; CHECK-NEXT: add w12, w14, w12 -; CHECK-NEXT: smov w14, v0.b[3] -; CHECK-NEXT: smull x15, w13, w8 -; CHECK-NEXT: msub w11, w12, w9, w11 -; CHECK-NEXT: fmov s1, w10 -; CHECK-NEXT: lsr x10, x15, #63 -; CHECK-NEXT: asr x12, x15, #34 -; CHECK-NEXT: add w10, w12, w10 -; CHECK-NEXT: smull x12, w14, w8 -; CHECK-NEXT: smov w15, v0.b[4] -; CHECK-NEXT: mov v1.b[1], w11 -; CHECK-NEXT: msub w10, w10, w9, w13 -; CHECK-NEXT: lsr x11, x12, #63 -; CHECK-NEXT: asr x12, x12, #34 -; CHECK-NEXT: add w11, w12, w11 -; CHECK-NEXT: smov w13, v0.b[5] -; CHECK-NEXT: smull x12, w15, w8 -; CHECK-NEXT: mov v1.b[2], w10 -; CHECK-NEXT: msub w10, w11, w9, w14 -; CHECK-NEXT: lsr x11, x12, #63 -; CHECK-NEXT: asr x12, x12, #34 -; CHECK-NEXT: add w11, w12, w11 -; CHECK-NEXT: smull x12, w13, w8 -; CHECK-NEXT: smov w14, v0.b[6] -; CHECK-NEXT: mov v1.b[3], w10 -; CHECK-NEXT: msub w10, w11, w9, w15 -; CHECK-NEXT: lsr x11, x12, #63 -; CHECK-NEXT: asr x12, x12, #34 -; CHECK-NEXT: add w11, w12, w11 -; CHECK-NEXT: smov w15, v0.b[7] -; CHECK-NEXT: smull x12, w14, w8 -; CHECK-NEXT: mov v1.b[4], w10 -; CHECK-NEXT: msub w10, w11, w9, w13 -; CHECK-NEXT: lsr x11, x12, #63 -; CHECK-NEXT: asr x12, x12, #34 -; CHECK-NEXT: add w11, w12, w11 -; CHECK-NEXT: smull x8, w15, w8 -; CHECK-NEXT: mov v1.b[5], w10 -; CHECK-NEXT: msub w10, w11, w9, w14 -; CHECK-NEXT: lsr x11, x8, #63 -; CHECK-NEXT: asr x8, x8, #34 -; CHECK-NEXT: add w8, w8, w11 -; CHECK-NEXT: mov v1.b[6], w10 -; CHECK-NEXT: msub w8, w8, w9, w15 -; CHECK-NEXT: mov v1.b[7], w8 -; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: movi v1.8b, #103 +; CHECK-NEXT: movi v2.8b, #10 +; CHECK-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-NEXT: sshr v1.8b, v1.8b, #2 +; CHECK-NEXT: usra v1.8b, v1.8b, #7 +; CHECK-NEXT: mls v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %1 = srem <8 x i8> %x, ret <8 x i8> %1 @@ -423,39 +228,14 @@ define <4 x i16> @fold_srem_v4i16(<4 x i16> %x) { ; CHECK-LABEL: fold_srem_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w9, v0.h[0] ; CHECK-NEXT: mov w8, #26215 // =0x6667 -; CHECK-NEXT: movk w8, #26214, lsl #16 -; CHECK-NEXT: smov w10, v0.h[1] -; CHECK-NEXT: mov w13, #10 // =0xa -; CHECK-NEXT: smull x11, w9, w8 -; CHECK-NEXT: smull x14, w10, w8 -; CHECK-NEXT: lsr x12, x11, #63 -; CHECK-NEXT: asr x11, x11, #34 -; CHECK-NEXT: add w11, w11, w12 -; CHECK-NEXT: smov w12, v0.h[2] -; CHECK-NEXT: msub w9, w11, w13, w9 -; CHECK-NEXT: lsr x11, x14, #63 -; CHECK-NEXT: asr x14, x14, #34 -; CHECK-NEXT: add w11, w14, w11 -; CHECK-NEXT: smov w14, v0.h[3] -; CHECK-NEXT: smull x15, w12, w8 -; CHECK-NEXT: msub w10, w11, w13, w10 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: lsr x9, x15, #63 -; CHECK-NEXT: asr x11, x15, #34 -; CHECK-NEXT: add w9, w11, w9 -; CHECK-NEXT: smull x8, w14, w8 -; CHECK-NEXT: mov v0.h[1], w10 -; CHECK-NEXT: msub w9, w9, w13, w12 -; CHECK-NEXT: lsr x10, x8, #63 -; CHECK-NEXT: asr x8, x8, #34 -; CHECK-NEXT: add w8, w8, w10 -; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: msub w8, w8, w13, w14 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: movi v2.4h, #10 +; CHECK-NEXT: dup v1.4h, w8 +; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: sshr v1.4s, v1.4s, #18 +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: usra v1.4h, v1.4h, #15 +; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -482,25 +262,17 @@ define <2 x i32> @fold_srem_v2i32(<2 x i32> %x) { ; CHECK-LABEL: fold_srem_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: mov w8, #26215 // =0x6667 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: movi v3.2s, #10 ; CHECK-NEXT: movk w8, #26214, lsl #16 -; CHECK-NEXT: mov w10, v0.s[1] -; CHECK-NEXT: smull x11, w9, w8 -; CHECK-NEXT: lsr x12, x11, #63 -; CHECK-NEXT: asr x11, x11, #34 -; CHECK-NEXT: add w11, w11, w12 -; CHECK-NEXT: mov w12, #10 // =0xa -; CHECK-NEXT: smull x8, w10, w8 -; CHECK-NEXT: msub w9, w11, w12, w9 -; CHECK-NEXT: lsr x11, x8, #63 -; CHECK-NEXT: asr x8, x8, #34 -; CHECK-NEXT: add w8, w8, w11 -; CHECK-NEXT: msub w8, w8, w12, w10 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: dup v1.2s, w8 +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: ushr v2.2d, v1.2d, #63 +; CHECK-NEXT: sshr v1.2d, v1.2d, #34 +; CHECK-NEXT: xtn v2.2s, v2.2d +; CHECK-NEXT: xtn v1.2s, v1.2d +; CHECK-NEXT: add v1.2s, v1.2s, v2.2s +; CHECK-NEXT: mls v0.2s, v1.2s, v3.2s ; CHECK-NEXT: ret %1 = srem <2 x i32> %x, ret <2 x i32> %1 Index: llvm/test/CodeGen/AArch64/urem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/AArch64/urem-vector-lkk.ll +++ llvm/test/CodeGen/AArch64/urem-vector-lkk.ll @@ -4,40 +4,25 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; CHECK-LABEL: fold_urem_vec_1: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: mov w9, #55879 // =0xda47 -; CHECK-NEXT: movk w9, #689, lsl #16 -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: mov w11, #33826 // =0x8422 -; CHECK-NEXT: mov w12, #95 // =0x5f -; CHECK-NEXT: movk w11, #528, lsl #16 -; CHECK-NEXT: umov w13, v0.h[2] -; CHECK-NEXT: umull x9, w8, w9 -; CHECK-NEXT: umull x11, w10, w11 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: msub w8, w9, w12, w8 -; CHECK-NEXT: mov w9, #48149 // =0xbc15 -; CHECK-NEXT: movk w9, #668, lsl #16 -; CHECK-NEXT: mov w12, #124 // =0x7c -; CHECK-NEXT: umull x9, w13, w9 -; CHECK-NEXT: msub w10, w11, w12, w10 -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov w12, #22281 // =0x5709 -; CHECK-NEXT: lsr x8, x9, #32 -; CHECK-NEXT: mov w9, #98 // =0x62 -; CHECK-NEXT: movk w12, #65, lsl #16 -; CHECK-NEXT: msub w8, w8, w9, w13 -; CHECK-NEXT: mov v0.h[1], w10 -; CHECK-NEXT: umull x9, w11, w12 -; CHECK-NEXT: mov w10, #1003 // =0x3eb -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: mov v0.h[2], w8 -; CHECK-NEXT: msub w8, w9, w10, w11 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: adrp x9, .LCPI0_1 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: adrp x8, .LCPI0_2 +; CHECK-NEXT: ldr d2, [x9, :lo12:.LCPI0_1] +; CHECK-NEXT: adrp x9, .LCPI0_4 +; CHECK-NEXT: ushl v1.4h, v0.4h, v1.4h +; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: adrp x8, .LCPI0_3 +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h +; CHECK-NEXT: umull v2.4s, v3.4h, v2.4h +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI0_3] +; CHECK-NEXT: shrn v2.4h, v2.4s, #16 +; CHECK-NEXT: add v1.4h, v2.4h, v1.4h +; CHECK-NEXT: ldr d2, [x9, :lo12:.LCPI0_4] +; CHECK-NEXT: ushl v1.4h, v1.4h, v3.4h +; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -46,31 +31,13 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { ; CHECK-LABEL: fold_urem_vec_2: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: mov w9, #55879 // =0xda47 -; CHECK-NEXT: movk w9, #689, lsl #16 -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: mov w12, #95 // =0x5f -; CHECK-NEXT: umov w13, v0.h[2] -; CHECK-NEXT: umull x11, w8, w9 -; CHECK-NEXT: umull x14, w10, w9 -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: msub w8, w11, w12, w8 -; CHECK-NEXT: lsr x11, x14, #32 -; CHECK-NEXT: umull x14, w13, w9 -; CHECK-NEXT: msub w10, w11, w12, w10 -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: lsr x8, x14, #32 -; CHECK-NEXT: msub w8, w8, w12, w13 -; CHECK-NEXT: mov v0.h[1], w10 -; CHECK-NEXT: umull x9, w11, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: mov v0.h[2], w8 -; CHECK-NEXT: msub w8, w9, w12, w11 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov w8, #44151 // =0xac77 +; CHECK-NEXT: movi v2.4h, #95 +; CHECK-NEXT: dup v1.4h, w8 +; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: ushr v1.4s, v1.4s, #22 +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -81,34 +48,13 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; CHECK-LABEL: combine_urem_udiv: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: mov w9, #55879 // =0xda47 -; CHECK-NEXT: movk w9, #689, lsl #16 -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: mov w12, #95 // =0x5f -; CHECK-NEXT: umov w14, v0.h[2] -; CHECK-NEXT: umov w15, v0.h[3] -; CHECK-NEXT: umull x11, w8, w9 -; CHECK-NEXT: umull x13, w10, w9 -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: msub w8, w11, w12, w8 -; CHECK-NEXT: msub w10, w13, w12, w10 -; CHECK-NEXT: fmov s1, w11 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: umull x8, w14, w9 -; CHECK-NEXT: umull x9, w15, w9 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: mov v0.h[1], w10 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: msub w10, w8, w12, w14 -; CHECK-NEXT: mov v1.h[1], w13 -; CHECK-NEXT: msub w11, w9, w12, w15 -; CHECK-NEXT: mov v0.h[2], w10 -; CHECK-NEXT: mov v1.h[2], w8 -; CHECK-NEXT: mov v0.h[3], w11 -; CHECK-NEXT: mov v1.h[3], w9 +; CHECK-NEXT: mov w8, #44151 // =0xac77 +; CHECK-NEXT: movi v2.4h, #95 +; CHECK-NEXT: dup v1.4h, w8 +; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: ushr v1.4s, v1.4s, #22 +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, @@ -122,25 +68,16 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_urem_power_of_two: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: umov w11, v0.h[1] -; CHECK-NEXT: umov w10, v0.h[3] -; CHECK-NEXT: mov w8, #55879 // =0xda47 -; CHECK-NEXT: movk w8, #689, lsl #16 -; CHECK-NEXT: and w9, w9, #0x3f -; CHECK-NEXT: umull x8, w10, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: and w9, w11, #0x1f -; CHECK-NEXT: umov w11, v0.h[2] -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: mov v1.h[1], w9 -; CHECK-NEXT: mov w9, #95 // =0x5f -; CHECK-NEXT: and w11, w11, #0x7 -; CHECK-NEXT: msub w8, w8, w9, w10 -; CHECK-NEXT: mov v1.h[2], w11 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: adrp x9, .LCPI3_2 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: adrp x8, .LCPI3_1 +; CHECK-NEXT: ldr d3, [x9, :lo12:.LCPI3_2] +; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: ushl v1.4h, v1.4h, v2.4h +; CHECK-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -150,33 +87,27 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_urem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: mov w9, #13629 // =0x353d -; CHECK-NEXT: movk w9, #100, lsl #16 -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: mov w11, #25645 // =0x642d -; CHECK-NEXT: mov w12, #654 // =0x28e -; CHECK-NEXT: movk w11, #2849, lsl #16 -; CHECK-NEXT: mov w13, #5560 // =0x15b8 -; CHECK-NEXT: umull x9, w8, w9 -; CHECK-NEXT: movk w13, #12, lsl #16 -; CHECK-NEXT: umull x11, w10, w11 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: msub w8, w9, w12, w8 -; CHECK-NEXT: umov w9, v0.h[3] -; CHECK-NEXT: mov w12, #23 // =0x17 -; CHECK-NEXT: msub w10, w11, w12, w10 -; CHECK-NEXT: mov w11, #5423 // =0x152f -; CHECK-NEXT: mov v1.h[1], w8 -; CHECK-NEXT: umull x8, w9, w13 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: mov v1.h[2], w10 -; CHECK-NEXT: msub w8, w8, w11, w9 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: adrp x8, .LCPI4_1 +; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_1] +; CHECK-NEXT: adrp x8, .LCPI4_2 +; CHECK-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI4_2] +; CHECK-NEXT: adrp x8, .LCPI4_3 +; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h +; CHECK-NEXT: umull v2.4s, v3.4h, v2.4h +; CHECK-NEXT: movi d3, #0xffffffffffff0000 +; CHECK-NEXT: shrn v2.4h, v2.4s, #16 +; CHECK-NEXT: add v1.4h, v2.4h, v1.4h +; CHECK-NEXT: movi d2, #0x0000000000ffff +; CHECK-NEXT: ushl v1.4h, v1.4h, v4.4h +; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI4_3] +; CHECK-NEXT: and v1.8b, v1.8b, v3.8b +; CHECK-NEXT: and v2.8b, v0.8b, v2.8b +; CHECK-NEXT: orr v1.8b, v2.8b, v1.8b +; CHECK-NEXT: mls v0.4h, v1.4h, v4.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -252,51 +183,12 @@ define <8 x i8> @fold_urem_v8i8(<8 x i8> %x) { ; CHECK-LABEL: fold_urem_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.b[0] -; CHECK-NEXT: mov w9, #39322 // =0x999a -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: umov w10, v0.b[1] -; CHECK-NEXT: mov w12, #10 // =0xa -; CHECK-NEXT: umov w13, v0.b[2] -; CHECK-NEXT: umull x11, w8, w9 -; CHECK-NEXT: umull x14, w10, w9 -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: umull x15, w13, w9 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: msub w8, w11, w12, w8 -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: msub w10, w14, w12, w10 -; CHECK-NEXT: lsr x14, x15, #32 -; CHECK-NEXT: msub w13, w14, w12, w13 -; CHECK-NEXT: umov w14, v0.b[4] -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: umull x8, w11, w9 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: mov v1.b[1], w10 -; CHECK-NEXT: umull x10, w14, w9 -; CHECK-NEXT: msub w8, w8, w12, w11 -; CHECK-NEXT: umov w11, v0.b[5] -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: mov v1.b[2], w13 -; CHECK-NEXT: msub w10, w10, w12, w14 -; CHECK-NEXT: umov w13, v0.b[6] -; CHECK-NEXT: mov v1.b[3], w8 -; CHECK-NEXT: umull x8, w11, w9 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: mov v1.b[4], w10 -; CHECK-NEXT: umull x10, w13, w9 -; CHECK-NEXT: msub w8, w8, w12, w11 -; CHECK-NEXT: umov w11, v0.b[7] -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: msub w10, w10, w12, w13 -; CHECK-NEXT: mov v1.b[5], w8 -; CHECK-NEXT: umull x8, w11, w9 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: mov v1.b[6], w10 -; CHECK-NEXT: msub w8, w8, w12, w11 -; CHECK-NEXT: mov v1.b[7], w8 -; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: movi v1.8b, #205 +; CHECK-NEXT: movi v2.8b, #10 +; CHECK-NEXT: umull v1.8h, v0.8b, v1.8b +; CHECK-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-NEXT: ushr v1.8b, v1.8b, #3 +; CHECK-NEXT: mls v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %1 = urem <8 x i8> %x, ret <8 x i8> %1 @@ -321,31 +213,13 @@ define <4 x i16> @fold_urem_v4i16(<4 x i16> %x) { ; CHECK-LABEL: fold_urem_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: mov w9, #39322 // =0x999a -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: mov w12, #10 // =0xa -; CHECK-NEXT: umov w13, v0.h[2] -; CHECK-NEXT: umull x11, w8, w9 -; CHECK-NEXT: umull x14, w10, w9 -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: msub w8, w11, w12, w8 -; CHECK-NEXT: lsr x11, x14, #32 -; CHECK-NEXT: umull x14, w13, w9 -; CHECK-NEXT: msub w10, w11, w12, w10 -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: lsr x8, x14, #32 -; CHECK-NEXT: msub w8, w8, w12, w13 -; CHECK-NEXT: mov v0.h[1], w10 -; CHECK-NEXT: umull x9, w11, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: mov v0.h[2], w8 -; CHECK-NEXT: msub w8, w9, w12, w11 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov w8, #52429 // =0xcccd +; CHECK-NEXT: movi v2.4h, #10 +; CHECK-NEXT: dup v1.4h, w8 +; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: ushr v1.4s, v1.4s, #19 +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -371,21 +245,14 @@ define <2 x i32> @fold_urem_v2i32(<2 x i32> %x) { ; CHECK-LABEL: fold_urem_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: mov w8, #52429 // =0xcccd -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: movi v2.2s, #10 ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: mov w10, v0.s[1] -; CHECK-NEXT: mov w12, #10 // =0xa -; CHECK-NEXT: umull x11, w9, w8 -; CHECK-NEXT: lsr x11, x11, #35 -; CHECK-NEXT: umull x8, w10, w8 -; CHECK-NEXT: msub w9, w11, w12, w9 -; CHECK-NEXT: lsr x8, x8, #35 -; CHECK-NEXT: msub w8, w8, w12, w10 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: dup v1.2s, w8 +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: ushr v1.2d, v1.2d, #35 +; CHECK-NEXT: xtn v1.2s, v1.2d +; CHECK-NEXT: mls v0.2s, v1.2s, v2.2s ; CHECK-NEXT: ret %1 = urem <2 x i32> %x, ret <2 x i32> %1