diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -204,6 +204,10 @@
     VTBL2,        // 2-register shuffle with mask
     VMOVN,        // MVE vmovn
 
+    // MVE Saturating truncates
+    VQMOVNs,      // Vector (V) Saturating (Q) Move and Narrow (N), signed (s)
+    VQMOVNu,      // Vector (V) Saturating (Q) Move and Narrow (N), unsigned (u)
+
     // Vector multiply long:
     VMULLs,       // ...signed
     VMULLu,       // ...unsigned
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -946,6 +946,12 @@
     setTargetDAGCombine(ISD::ADD);
     setTargetDAGCombine(ISD::BITCAST);
   }
+  if (Subtarget->hasMVEIntegerOps()) {
+    setTargetDAGCombine(ISD::SMIN);
+    setTargetDAGCombine(ISD::UMIN);
+    setTargetDAGCombine(ISD::SMAX);
+    setTargetDAGCombine(ISD::UMAX);
+  }
 
   if (!Subtarget->hasFP64()) {
     // When targeting a floating-point unit with only single-precision
@@ -1668,6 +1674,8 @@
   case ARMISD::VTBL1:    return "ARMISD::VTBL1";
   case ARMISD::VTBL2:    return "ARMISD::VTBL2";
   case ARMISD::VMOVN:    return "ARMISD::VMOVN";
+  case ARMISD::VQMOVNs:  return "ARMISD::VQMOVNs";
+  case ARMISD::VQMOVNu:  return "ARMISD::VQMOVNu";
   case ARMISD::VMULLs:   return "ARMISD::VMULLs";
   case ARMISD::VMULLu:   return "ARMISD::VMULLu";
   case ARMISD::VADDVs:   return "ARMISD::VADDVs";
@@ -14864,6 +14872,107 @@
   return SDValue();
 }
 
+/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
+/// saturates.
+static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
+                                    const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  if (!ST->hasMVEIntegerOps())
+    return SDValue();
+
+  if (VT != MVT::v4i32 && VT != MVT::v8i16)
+    return SDValue();
+
+  auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
+    // Check that one node is a smin and the other is a smax.
+    if (Min->getOpcode() != ISD::SMIN)
+      std::swap(Min, Max);
+    if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
+      return false;
+
+    APInt SaturateC;
+    if (VT == MVT::v4i32)
+      SaturateC = APInt(32, (1 << 15) - 1, true);
+    else // if (VT == MVT::v8i16)
+      SaturateC = APInt(16, (1 << 7) - 1, true);
+
+    APInt MinC, MaxC;
+    if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
+        MinC != SaturateC)
+      return false;
+    if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
+        MaxC != ~SaturateC)
+      return false;
+    return true;
+  };
+
+  if (IsSignedSaturate(N, N0.getNode())) {
+    SDLoc DL(N);
+    MVT ExtVT, HalfVT;
+    if (VT == MVT::v4i32) {
+      HalfVT = MVT::v8i16;
+      ExtVT = MVT::v4i16;
+    } else { // if (VT == MVT::v8i16)
+      HalfVT = MVT::v16i8;
+      ExtVT = MVT::v8i8;
+    }
+
+    // Create a VQMOVNB with undef top lanes, then sign extended into the top
+    // half. That extend will hopefully be removed if only the bottom bits are
+    // demanded (through a truncating store, for example).
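+    // For example, with VT == v4i32 this rewrites
+    //   smax(smin(x, splat(32767)), splat(-32768))
+    // into sign_extend_inreg(vector_reg_cast(VQMOVNs(undef, x)), v4i16).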
+    SDValue VQMOVN =
+        DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
+                    N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
+    SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
+                       DAG.getValueType(ExtVT));
+  }
+
+  auto IsUnsignedSaturate = [&](SDNode *Min) {
+    // For unsigned, we just need to check for <= 0xffff (<= 0xff for v8i16).
+    if (Min->getOpcode() != ISD::UMIN)
+      return false;
+
+    APInt SaturateC;
+    if (VT == MVT::v4i32)
+      SaturateC = APInt(32, (1 << 16) - 1, true);
+    else // if (VT == MVT::v8i16)
+      SaturateC = APInt(16, (1 << 8) - 1, true);
+
+    APInt MinC;
+    if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
+        MinC != SaturateC)
+      return false;
+    return true;
+  };
+
+  if (IsUnsignedSaturate(N)) {
+    SDLoc DL(N);
+    MVT HalfVT;
+    unsigned ExtConst;
+    if (VT == MVT::v4i32) {
+      HalfVT = MVT::v8i16;
+      ExtConst = 0x0000FFFF;
+    } else { // if (VT == MVT::v8i16)
+      HalfVT = MVT::v16i8;
+      ExtConst = 0x00FF;
+    }
+
+    // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
+    // an AND. That extend will hopefully be removed if only the bottom bits are
+    // demanded (through a truncating store, for example).
+    SDValue VQMOVN =
+        DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
+                    DAG.getConstant(0, DL, MVT::i32));
+    SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
+    return DAG.getNode(ISD::AND, DL, VT, Bitcast,
+                       DAG.getConstant(ExtConst, DL, VT));
+  }
+
+  return SDValue();
+}
+
 static const APInt *isPowerOf2Constant(SDValue V) {
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   if (!C)
@@ -15419,7 +15528,13 @@
     return PerformShiftCombine(N, DCI, Subtarget);
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
-  case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
+  case ISD::ANY_EXTEND:
+    return PerformExtendCombine(N, DCI.DAG, Subtarget);
+  case ISD::SMIN:
+  case ISD::UMIN:
+  case ISD::SMAX:
+  case ISD::UMAX:
+    return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
   case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
   case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
   case ISD::LOAD: return PerformLOADCombine(N, DCI);
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4714,6 +4714,31 @@
 defm : MVE_VQMOVN_p;
 defm : MVE_VQMOVN_p;
 
+def SDTARMVMOVNQ : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                        SDTCisVec<2>, SDTCisVT<3, i32>]>;
+def MVEvqmovns : SDNode<"ARMISD::VQMOVNs", SDTARMVMOVNQ>;
+def MVEvqmovnu : SDNode<"ARMISD::VQMOVNu", SDTARMVMOVNQ>;
+
+let Predicates = [HasMVEInt] in {
+  def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 0))),
+            (v8i16 (MVE_VQMOVNs32bh (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
+  def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 1))),
+            (v8i16 (MVE_VQMOVNs32th (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
+  def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))),
+            (v16i8 (MVE_VQMOVNs16bh (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+  def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))),
+            (v16i8 (MVE_VQMOVNs16th (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+
+  def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 0))),
+            (v8i16 (MVE_VQMOVNu32bh (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
+  def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 1))),
+            (v8i16 (MVE_VQMOVNu32th (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
+  def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))),
+            (v16i8 (MVE_VQMOVNu16bh (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+  def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))),
+            (v16i8 (MVE_VQMOVNu16th (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+}
+
 class MVE_VCVT_ff
   : MVE_qDest_qSrc
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.s32 q2, [r0], #8
-; CHECK-NEXT:    vldrh.s32 q3, [r1], #8
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vshr.s32 q2, q2, #15
-; CHECK-NEXT:    vmax.s32 q2, q2, q0
-; CHECK-NEXT:    vmin.s32 q2, q2, q1
-; CHECK-NEXT:    vstrh.32 q2, [r2], #8
+; CHECK-NEXT:    vldrh.s32 q0, [r0], #8
+; CHECK-NEXT:    vldrh.s32 q1, [r1], #8
+; CHECK-NEXT:    vmul.i32 q0, q1, q0
+; CHECK-NEXT:    vshr.s32 q0, q0, #15
+; CHECK-NEXT:    vqmovnb.s32 q0, q0
+; CHECK-NEXT:    vstrh.32 q0, [r2], #8
 ; CHECK-NEXT:    le lr, .LBB5_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r5, r3
@@ -1266,29 +1263,25 @@
 ; CHECK-NEXT:    bic r5, r3, #7
 ; CHECK-NEXT:    movs r4, #1
 ; CHECK-NEXT:    sub.w r6, r5, #8
-; CHECK-NEXT:    vmvn.i32 q0, #0x7fff
 ; CHECK-NEXT:    add.w r12, r0, r5, lsl #1
-; CHECK-NEXT:    vmov.i32 q1, #0x7fff
 ; CHECK-NEXT:    add.w lr, r4, r6, lsr #3
 ; CHECK-NEXT:    add.w r4, r2, r5, lsl #1
 ; CHECK-NEXT:    add.w r6, r1, r5, lsl #1
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB6_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.s32 q2, [r0, #8]
-; CHECK-NEXT:    vldrh.s32 q3, [r1, #8]
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vldrh.s32 q3, [r1], #16
-; CHECK-NEXT:    vshr.s32 q2, q2, #15
-; CHECK-NEXT:    vmax.s32 q2, q2, q0
-; CHECK-NEXT:    vmin.s32 q2, q2, q1
-; CHECK-NEXT:    vstrh.32 q2, [r2, #8]
-; CHECK-NEXT:    vldrh.s32 q2, [r0], #16
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vshr.s32 q2, q2, #15
-; CHECK-NEXT:    vmax.s32 q2, q2, q0
-; CHECK-NEXT:    vmin.s32 q2, q2, q1
-; CHECK-NEXT:    vstrh.32 q2, [r2], #16
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT:    vmul.i32 q0, q1, q0
+; CHECK-NEXT:    vldrh.s32 q1, [r1], #16
+; CHECK-NEXT:    vshr.s32 q0, q0, #15
+; CHECK-NEXT:    vqmovnb.s32 q0, q0
+; CHECK-NEXT:    vstrh.32 q0, [r2, #8]
+; CHECK-NEXT:    vldrh.s32 q0, [r0], #16
+; CHECK-NEXT:    vmul.i32 q0, q1, q0
+; CHECK-NEXT:    vshr.s32 q0, q0, #15
+; CHECK-NEXT:    vqmovnb.s32 q0, q0
+; CHECK-NEXT:    vstrh.32 q0, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB6_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r5, r3
@@ -1399,8 +1392,6 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq .LBB7_8
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
@@ -1416,31 +1407,30 @@
 ; CHECK-NEXT:    bic r5, r3, #7
 ; CHECK-NEXT:    movs r4, #1
 ; CHECK-NEXT:    sub.w r6, r5, #8
-; CHECK-NEXT:    vmvn.i32 q0, #0x7fff
 ; CHECK-NEXT:    add.w r12, r0, r5, lsl #1
-; CHECK-NEXT:    vmov.i32 q1, #0x7fff
 ; CHECK-NEXT:    add.w lr, r4, r6, lsr #3
 ; CHECK-NEXT:    add.w r4, r2, r5, lsl #1
 ; CHECK-NEXT:    add.w r6, r1, r5, lsl #1
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB7_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q2, [r0], #16
-; CHECK-NEXT:    vldrh.u16 q3, [r1], #16
-; CHECK-NEXT:    vmullt.s16 q4, q3, q2
-; CHECK-NEXT:    vmullb.s16 q2, q3, q2
-; CHECK-NEXT:    vshr.s32 q4, q4, #15
+; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
+; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
+; CHECK-NEXT:    vmullt.s16 q2, q1, q0
+; CHECK-NEXT:
vmullb.s16 q0, q1, q0 ; CHECK-NEXT: vshr.s32 q2, q2, #15 -; CHECK-NEXT: vmax.s32 q4, q4, q0 -; CHECK-NEXT: vmax.s32 q2, q2, q0 -; CHECK-NEXT: vmin.s32 q4, q4, q1 -; CHECK-NEXT: vmin.s32 q2, q2, q1 -; CHECK-NEXT: vmovnt.i32 q2, q4 -; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: vshr.s32 q0, q0, #15 +; CHECK-NEXT: vqmovnb.s32 q2, q2 +; CHECK-NEXT: vqmovnb.s32 q0, q0 +; CHECK-NEXT: vmovlb.s16 q2, q2 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB7_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: beq .LBB7_8 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader21 ; CHECK-NEXT: movw r0, #32768 ; CHECK-NEXT: sub.w lr, r3, r5 @@ -1462,7 +1452,6 @@ ; CHECK-NEXT: strh r3, [r4], #2 ; CHECK-NEXT: le lr, .LBB7_7 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %cmp8 = icmp eq i32 %N, 0 @@ -1560,43 +1549,39 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: cbz r3, .LBB8_3 -; CHECK-NEXT: @ %bb.1: @ %vector.ph +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r4, pc} ; CHECK-NEXT: add.w r12, r3, #3 ; CHECK-NEXT: adr r4, .LCPI8_0 ; CHECK-NEXT: bic r12, r12, #3 ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: vmvn.i32 q2, #0x7fff -; CHECK-NEXT: vmov.i32 q3, #0x7fff ; CHECK-NEXT: add.w lr, lr, r12, lsr #2 ; CHECK-NEXT: sub.w r12, r3, #1 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB8_2: @ %vector.body +; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vdup.32 q4, r3 +; CHECK-NEXT: vdup.32 q2, r3 ; CHECK-NEXT: adds r3, #4 -; CHECK-NEXT: vorr q4, q4, q0 -; CHECK-NEXT: vptt.u32 cs, q1, q4 -; CHECK-NEXT: vldrht.s32 q4, [r0], #8 -; CHECK-NEXT: vldrht.s32 q5, [r1], #8 -; CHECK-NEXT: vmul.i32 q4, q5, q4 -; CHECK-NEXT: vshr.s32 q4, q4, #15 -; CHECK-NEXT: vmax.s32 q4, q4, q2 -; CHECK-NEXT: vmin.s32 q4, q4, q3 +; CHECK-NEXT: vorr q2, q2, q0 +; CHECK-NEXT: vptt.u32 cs, q1, q2 +; CHECK-NEXT: vldrht.s32 q2, [r0], #8 +; CHECK-NEXT: vldrht.s32 q3, [r1], #8 +; CHECK-NEXT: vmul.i32 q2, q3, q2 +; CHECK-NEXT: vshr.s32 q2, q2, #15 +; CHECK-NEXT: vqmovnb.s32 q2, q2 +; CHECK-NEXT: vmovlb.s16 q2, q2 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.32 q4, [r2], #8 -; CHECK-NEXT: le lr, .LBB8_2 -; CHECK-NEXT: .LBB8_3: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vstrht.32 q2, [r2], #8 +; CHECK-NEXT: le lr, .LBB8_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.4: +; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI8_0: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 1 @ 0x1 @@ -1653,8 +1638,8 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #56 -; CHECK-NEXT: sub sp, #56 +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB9_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph @@ -1668,110 +1653,104 @@ ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: add.w lr, lr, r12, lsr #3 ; CHECK-NEXT: sub.w r12, r3, #1 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vstrw.32 q0, [sp, 
#32] @ 16-byte Spill -; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: vldrw.u32 q4, [r4] ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: vmvn.i32 q5, #0x7fff -; CHECK-NEXT: vmov.i32 q6, #0x7fff +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB9_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vdup.32 q1, r3 -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload +; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: adds r3, #8 -; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: vcmp.u32 cs, q7, q0 -; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: vmov r4, s17 -; CHECK-NEXT: vmov.16 q0[1], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: vmov.16 q0[2], r4 -; CHECK-NEXT: vmov r4, s19 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q0[3], r4 -; CHECK-NEXT: vorr q1, q1, q4 -; CHECK-NEXT: vcmp.u32 cs, q7, q1 -; CHECK-NEXT: vpsel q1, q3, q2 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov.16 q0[4], r4 -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: vmov.16 q0[5], r4 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov.16 q0[6], r4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: vpt.i16 ne, q0, zr -; CHECK-NEXT: vldrht.u16 q0, [r0], #16 -; CHECK-NEXT: vmov.u16 r4, q0[0] -; CHECK-NEXT: vmov.32 q7[0], r4 -; CHECK-NEXT: vmov.u16 r4, q0[1] -; CHECK-NEXT: vmov.32 q7[1], r4 -; CHECK-NEXT: vmov.u16 r4, q0[2] -; CHECK-NEXT: vmov.32 q7[2], r4 -; CHECK-NEXT: vmov.u16 r4, q0[3] -; CHECK-NEXT: vmov.32 q7[3], r4 +; CHECK-NEXT: vorr q5, q0, q5 +; CHECK-NEXT: vorr q0, q0, q4 +; CHECK-NEXT: vcmp.u32 cs, q1, q5 +; CHECK-NEXT: vpsel q6, q3, q2 +; CHECK-NEXT: vcmp.u32 cs, q1, q0 +; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: vpsel q0, q3, q2 +; CHECK-NEXT: vmov.16 q5[0], r4 +; CHECK-NEXT: vmov r4, s25 +; CHECK-NEXT: vmov.16 q5[1], r4 +; CHECK-NEXT: vmov r4, s26 +; CHECK-NEXT: vmov.16 q5[2], r4 +; CHECK-NEXT: vmov r4, s27 +; CHECK-NEXT: vmov.16 q5[3], r4 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov.16 q5[4], r4 +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vmov.16 q5[5], r4 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov.16 q5[6], r4 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: vmov.16 q5[7], r4 +; CHECK-NEXT: vpt.i16 ne, q5, zr +; CHECK-NEXT: vldrht.u16 q6, [r0], #16 +; CHECK-NEXT: vmov.u16 r4, q6[0] ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrht.u16 q4, [r1], #16 -; CHECK-NEXT: vmov.u16 r4, q4[0] -; CHECK-NEXT: vmov.32 q1[0], r4 -; CHECK-NEXT: vmov.u16 r4, q4[1] -; CHECK-NEXT: vmov.32 q1[1], r4 -; CHECK-NEXT: vmov.u16 r4, q4[2] -; CHECK-NEXT: vmov.32 q1[2], r4 -; CHECK-NEXT: vmov.u16 r4, q4[3] -; CHECK-NEXT: vmov.32 q1[3], r4 -; CHECK-NEXT: vmullb.s16 q1, q1, q7 -; CHECK-NEXT: vshr.s32 q1, q1, #15 -; CHECK-NEXT: vmax.s32 q1, q1, q5 -; CHECK-NEXT: vmin.s32 q1, q1, q6 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov.16 q7[0], r4 -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: vmov.16 q7[1], r4 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov.16 q7[2], r4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: vmov.16 q7[3], r4 -; CHECK-NEXT: vmov.u16 r4, q0[4] -; CHECK-NEXT: vmov.32 q1[0], r4 -; CHECK-NEXT: vmov.u16 r4, q0[5] -; CHECK-NEXT: vmov.32 q1[1], r4 -; CHECK-NEXT: vmov.u16 r4, q0[6] -; CHECK-NEXT: vmov.32 q1[2], r4 -; CHECK-NEXT: vmov.u16 r4, q0[7] -; 
CHECK-NEXT: vmov.32 q1[3], r4 -; CHECK-NEXT: vmov.u16 r4, q4[4] +; CHECK-NEXT: vldrht.u16 q7, [r1], #16 +; CHECK-NEXT: vmov.32 q5[0], r4 +; CHECK-NEXT: vmov.u16 r4, q6[1] +; CHECK-NEXT: vmov.32 q5[1], r4 +; CHECK-NEXT: vmov.u16 r4, q6[2] +; CHECK-NEXT: vmov.32 q5[2], r4 +; CHECK-NEXT: vmov.u16 r4, q6[3] +; CHECK-NEXT: vmov.32 q5[3], r4 +; CHECK-NEXT: vmov.u16 r4, q7[0] ; CHECK-NEXT: vmov.32 q0[0], r4 -; CHECK-NEXT: vmov.u16 r4, q4[5] +; CHECK-NEXT: vmov.u16 r4, q7[1] ; CHECK-NEXT: vmov.32 q0[1], r4 -; CHECK-NEXT: vmov.u16 r4, q4[6] +; CHECK-NEXT: vmov.u16 r4, q7[2] ; CHECK-NEXT: vmov.32 q0[2], r4 -; CHECK-NEXT: vmov.u16 r4, q4[7] +; CHECK-NEXT: vmov.u16 r4, q7[3] ; CHECK-NEXT: vmov.32 q0[3], r4 -; CHECK-NEXT: vmullb.s16 q0, q0, q1 +; CHECK-NEXT: vmullb.s16 q0, q0, q5 ; CHECK-NEXT: vshr.s32 q0, q0, #15 -; CHECK-NEXT: vmax.s32 q0, q0, q5 -; CHECK-NEXT: vmin.s32 q0, q0, q6 +; CHECK-NEXT: vqmovnb.s32 q0, q0 +; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov.16 q7[4], r4 +; CHECK-NEXT: vmov.16 q5[0], r4 ; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: vmov.16 q7[5], r4 +; CHECK-NEXT: vmov.16 q5[1], r4 ; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov.16 q7[6], r4 +; CHECK-NEXT: vmov.16 q5[2], r4 ; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: vmov.16 q7[7], r4 +; CHECK-NEXT: vmov.16 q5[3], r4 +; CHECK-NEXT: vmov.u16 r4, q6[4] +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov.u16 r4, q6[5] +; CHECK-NEXT: vmov.32 q0[1], r4 +; CHECK-NEXT: vmov.u16 r4, q6[6] +; CHECK-NEXT: vmov.32 q0[2], r4 +; CHECK-NEXT: vmov.u16 r4, q6[7] +; CHECK-NEXT: vmov.32 q0[3], r4 +; CHECK-NEXT: vmov.u16 r4, q7[4] +; CHECK-NEXT: vmov.32 q6[0], r4 +; CHECK-NEXT: vmov.u16 r4, q7[5] +; CHECK-NEXT: vmov.32 q6[1], r4 +; CHECK-NEXT: vmov.u16 r4, q7[6] +; CHECK-NEXT: vmov.32 q6[2], r4 +; CHECK-NEXT: vmov.u16 r4, q7[7] +; CHECK-NEXT: vmov.32 q6[3], r4 +; CHECK-NEXT: vmullb.s16 q0, q6, q0 +; CHECK-NEXT: vshr.s32 q0, q0, #15 +; CHECK-NEXT: vqmovnb.s32 q0, q0 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov.16 q5[4], r4 +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vmov.16 q5[5], r4 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov.16 q5[6], r4 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: vmov.16 q5[7], r4 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q7, [r2], #16 +; CHECK-NEXT: vstrht.16 q5, [r2], #16 ; CHECK-NEXT: le lr, .LBB9_2 ; CHECK-NEXT: .LBB9_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #56 +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 @@ -1837,8 +1816,6 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq .LBB10_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph @@ -1852,60 +1829,53 @@ ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: add.w lr, lr, r12, lsr #3 ; CHECK-NEXT: sub.w r12, r3, #1 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vldrw.u32 q4, [r4] ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: vmvn.i32 q5, #0x7fff -; CHECK-NEXT: vmov.i32 q6, #0x7fff -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB10_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vdup.32 
q0, r3 +; CHECK-NEXT: vdup.32 q6, r3 ; CHECK-NEXT: adds r3, #8 -; CHECK-NEXT: vorr q7, q0, q4 -; CHECK-NEXT: vcmp.u32 cs, q1, q7 -; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov.16 q7[0], r4 -; CHECK-NEXT: vmov r4, s17 -; CHECK-NEXT: vmov.16 q7[1], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: vmov.16 q7[2], r4 -; CHECK-NEXT: vmov r4, s19 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q7[3], r4 -; CHECK-NEXT: vorr q0, q0, q4 -; CHECK-NEXT: vcmp.u32 cs, q1, q0 -; CHECK-NEXT: vpsel q0, q3, q2 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov.16 q7[4], r4 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: vmov.16 q7[5], r4 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov.16 q7[6], r4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: vmov.16 q7[7], r4 -; CHECK-NEXT: vptt.i16 ne, q7, zr -; CHECK-NEXT: vldrht.u16 q0, [r0], #16 -; CHECK-NEXT: vldrht.u16 q4, [r1], #16 -; CHECK-NEXT: vmullt.s16 q7, q4, q0 -; CHECK-NEXT: vmullb.s16 q0, q4, q0 +; CHECK-NEXT: vorr q5, q6, q0 +; CHECK-NEXT: vorr q6, q6, q4 +; CHECK-NEXT: vcmp.u32 cs, q1, q5 +; CHECK-NEXT: vpsel q7, q3, q2 +; CHECK-NEXT: vcmp.u32 cs, q1, q6 +; CHECK-NEXT: vmov r4, s28 +; CHECK-NEXT: vpsel q6, q3, q2 +; CHECK-NEXT: vmov.16 q5[0], r4 +; CHECK-NEXT: vmov r4, s29 +; CHECK-NEXT: vmov.16 q5[1], r4 +; CHECK-NEXT: vmov r4, s30 +; CHECK-NEXT: vmov.16 q5[2], r4 +; CHECK-NEXT: vmov r4, s31 +; CHECK-NEXT: vmov.16 q5[3], r4 +; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: vmov.16 q5[4], r4 +; CHECK-NEXT: vmov r4, s25 +; CHECK-NEXT: vmov.16 q5[5], r4 +; CHECK-NEXT: vmov r4, s26 +; CHECK-NEXT: vmov.16 q5[6], r4 +; CHECK-NEXT: vmov r4, s27 +; CHECK-NEXT: vmov.16 q5[7], r4 +; CHECK-NEXT: vptt.i16 ne, q5, zr +; CHECK-NEXT: vldrht.u16 q5, [r0], #16 +; CHECK-NEXT: vldrht.u16 q6, [r1], #16 +; CHECK-NEXT: vmullt.s16 q7, q6, q5 +; CHECK-NEXT: vmullb.s16 q5, q6, q5 ; CHECK-NEXT: vshr.s32 q7, q7, #15 -; CHECK-NEXT: vshr.s32 q0, q0, #15 -; CHECK-NEXT: vmax.s32 q7, q7, q5 -; CHECK-NEXT: vmax.s32 q0, q0, q5 -; CHECK-NEXT: vmin.s32 q7, q7, q6 -; CHECK-NEXT: vmin.s32 q0, q0, q6 -; CHECK-NEXT: vmovnt.i32 q0, q7 +; CHECK-NEXT: vshr.s32 q5, q5, #15 +; CHECK-NEXT: vqmovnb.s32 q7, q7 +; CHECK-NEXT: vqmovnb.s32 q5, q5 +; CHECK-NEXT: vmovlb.s16 q7, q7 +; CHECK-NEXT: vmovlb.s16 q5, q5 +; CHECK-NEXT: vmovnt.i32 q5, q7 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r2], #16 +; CHECK-NEXT: vstrht.16 q5, [r2], #16 ; CHECK-NEXT: le lr, .LBB10_2 ; CHECK-NEXT: .LBB10_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #40 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 @@ -1997,7 +1967,6 @@ ; CHECK-NEXT: bic r5, r3, #3 ; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: subs r6, r5, #4 -; CHECK-NEXT: vmov.i32 q0, #0xffff ; CHECK-NEXT: add.w r12, r0, r5, lsl #1 ; CHECK-NEXT: add.w lr, r4, r6, lsr #2 ; CHECK-NEXT: add.w r4, r2, r5, lsl #1 @@ -2005,12 +1974,12 @@ ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB11_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q1, [r0], #8 -; CHECK-NEXT: vldrh.u32 q2, [r1], #8 -; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vshr.u32 q1, q1, #15 -; CHECK-NEXT: vmin.u32 q1, q1, q0 -; CHECK-NEXT: vstrh.32 q1, [r2], #8 +; CHECK-NEXT: vldrh.u32 q0, [r0], #8 +; CHECK-NEXT: vldrh.u32 q1, [r1], #8 +; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vshr.u32 q0, q0, #15 +; CHECK-NEXT: vqmovnb.u32 q0, q0 +; CHECK-NEXT: vstrh.32 q0, [r2], #8 ; CHECK-NEXT: le lr, .LBB11_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 @@ -2128,7 
+2097,6 @@ ; CHECK-NEXT: bic r5, r3, #7 ; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: sub.w r6, r5, #8 -; CHECK-NEXT: vmov.i32 q0, #0xffff ; CHECK-NEXT: add.w r12, r0, r5, lsl #1 ; CHECK-NEXT: add.w lr, r4, r6, lsr #3 ; CHECK-NEXT: add.w r4, r2, r5, lsl #1 @@ -2136,18 +2104,18 @@ ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB12_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q1, [r0, #8] -; CHECK-NEXT: vldrh.u32 q2, [r1, #8] -; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vldrh.u32 q2, [r1], #16 -; CHECK-NEXT: vshr.u32 q1, q1, #15 -; CHECK-NEXT: vmin.u32 q1, q1, q0 -; CHECK-NEXT: vstrh.32 q1, [r2, #8] -; CHECK-NEXT: vldrh.u32 q1, [r0], #16 -; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vshr.u32 q1, q1, #15 -; CHECK-NEXT: vmin.u32 q1, q1, q0 -; CHECK-NEXT: vstrh.32 q1, [r2], #16 +; CHECK-NEXT: vldrh.u32 q0, [r0, #8] +; CHECK-NEXT: vldrh.u32 q1, [r1, #8] +; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vldrh.u32 q1, [r1], #16 +; CHECK-NEXT: vshr.u32 q0, q0, #15 +; CHECK-NEXT: vqmovnb.u32 q0, q0 +; CHECK-NEXT: vstrh.32 q0, [r2, #8] +; CHECK-NEXT: vldrh.u32 q0, [r0], #16 +; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vshr.u32 q0, q0, #15 +; CHECK-NEXT: vqmovnb.u32 q0, q0 +; CHECK-NEXT: vstrh.32 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB12_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 @@ -2408,21 +2376,18 @@ ; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: sub.w r6, r5, #8 ; CHECK-NEXT: add.w r12, r0, r5 -; CHECK-NEXT: vmvn.i16 q0, #0x7f -; CHECK-NEXT: vmov.i16 q1, #0x7f ; CHECK-NEXT: add.w lr, r4, r6, lsr #3 ; CHECK-NEXT: adds r4, r2, r5 ; CHECK-NEXT: adds r6, r1, r5 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB14_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.s16 q2, [r0], #8 -; CHECK-NEXT: vldrb.s16 q3, [r1], #8 -; CHECK-NEXT: vmul.i16 q2, q3, q2 -; CHECK-NEXT: vshr.s16 q2, q2, #7 -; CHECK-NEXT: vmax.s16 q2, q2, q0 -; CHECK-NEXT: vmin.s16 q2, q2, q1 -; CHECK-NEXT: vstrb.16 q2, [r2], #8 +; CHECK-NEXT: vldrb.s16 q0, [r0], #8 +; CHECK-NEXT: vldrb.s16 q1, [r1], #8 +; CHECK-NEXT: vmul.i16 q0, q1, q0 +; CHECK-NEXT: vshr.s16 q0, q0, #7 +; CHECK-NEXT: vqmovnb.s16 q0, q0 +; CHECK-NEXT: vstrb.16 q0, [r2], #8 ; CHECK-NEXT: le lr, .LBB14_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 @@ -2547,28 +2512,24 @@ ; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: sub.w r6, r5, #16 ; CHECK-NEXT: add.w r12, r0, r5 -; CHECK-NEXT: vmvn.i16 q0, #0x7f -; CHECK-NEXT: vmov.i16 q1, #0x7f ; CHECK-NEXT: add.w lr, r4, r6, lsr #4 ; CHECK-NEXT: adds r4, r2, r5 ; CHECK-NEXT: adds r6, r1, r5 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB15_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.s16 q2, [r0, #8] -; CHECK-NEXT: vldrb.s16 q3, [r1, #8] -; CHECK-NEXT: vmul.i16 q2, q3, q2 -; CHECK-NEXT: vldrb.s16 q3, [r1], #16 -; CHECK-NEXT: vshr.s16 q2, q2, #7 -; CHECK-NEXT: vmax.s16 q2, q2, q0 -; CHECK-NEXT: vmin.s16 q2, q2, q1 -; CHECK-NEXT: vstrb.16 q2, [r2, #8] -; CHECK-NEXT: vldrb.s16 q2, [r0], #16 -; CHECK-NEXT: vmul.i16 q2, q3, q2 -; CHECK-NEXT: vshr.s16 q2, q2, #7 -; CHECK-NEXT: vmax.s16 q2, q2, q0 -; CHECK-NEXT: vmin.s16 q2, q2, q1 -; CHECK-NEXT: vstrb.16 q2, [r2], #16 +; CHECK-NEXT: vldrb.s16 q0, [r0, #8] +; CHECK-NEXT: vldrb.s16 q1, [r1, #8] +; CHECK-NEXT: vmul.i16 q0, q1, q0 +; CHECK-NEXT: vldrb.s16 q1, [r1], #16 +; CHECK-NEXT: vshr.s16 q0, q0, #7 +; CHECK-NEXT: vqmovnb.s16 q0, q0 +; CHECK-NEXT: vstrb.16 q0, [r2, #8] +; CHECK-NEXT: vldrb.s16 q0, [r0], #16 +; CHECK-NEXT: vmul.i16 q0, q1, q0 
+; CHECK-NEXT: vshr.s16 q0, q0, #7 +; CHECK-NEXT: vqmovnb.s16 q0, q0 +; CHECK-NEXT: vstrb.16 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB15_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 @@ -2677,8 +2638,6 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq .LBB16_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader @@ -2695,30 +2654,29 @@ ; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: sub.w r6, r5, #16 ; CHECK-NEXT: add.w r12, r0, r5 -; CHECK-NEXT: vmvn.i16 q0, #0x7f -; CHECK-NEXT: vmov.i16 q1, #0x7f ; CHECK-NEXT: add.w lr, r4, r6, lsr #4 ; CHECK-NEXT: adds r4, r2, r5 ; CHECK-NEXT: adds r6, r1, r5 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB16_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u8 q2, [r0], #16 -; CHECK-NEXT: vldrb.u8 q3, [r1], #16 -; CHECK-NEXT: vmullt.s8 q4, q3, q2 -; CHECK-NEXT: vmullb.s8 q2, q3, q2 -; CHECK-NEXT: vshr.s16 q4, q4, #7 +; CHECK-NEXT: vldrb.u8 q0, [r0], #16 +; CHECK-NEXT: vldrb.u8 q1, [r1], #16 +; CHECK-NEXT: vmullt.s8 q2, q1, q0 +; CHECK-NEXT: vmullb.s8 q0, q1, q0 ; CHECK-NEXT: vshr.s16 q2, q2, #7 -; CHECK-NEXT: vmax.s16 q4, q4, q0 -; CHECK-NEXT: vmax.s16 q2, q2, q0 -; CHECK-NEXT: vmin.s16 q4, q4, q1 -; CHECK-NEXT: vmin.s16 q2, q2, q1 -; CHECK-NEXT: vmovnt.i16 q2, q4 -; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: vshr.s16 q0, q0, #7 +; CHECK-NEXT: vqmovnb.s16 q2, q2 +; CHECK-NEXT: vqmovnb.s16 q0, q0 +; CHECK-NEXT: vmovlb.s8 q2, q2 +; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB16_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: beq .LBB16_8 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB16_6: @ %for.body.preheader23 ; CHECK-NEXT: sub.w lr, r3, r5 ; CHECK-NEXT: mvn r0, #127 @@ -2738,7 +2696,6 @@ ; CHECK-NEXT: strb r2, [r4], #1 ; CHECK-NEXT: le lr, .LBB16_7 ; CHECK-NEXT: .LBB16_8: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %cmp10 = icmp eq i32 %N, 0 @@ -2838,8 +2795,6 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq .LBB17_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph @@ -2853,55 +2808,48 @@ ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: add.w lr, lr, r12, lsr #3 ; CHECK-NEXT: sub.w r12, r3, #1 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vldrw.u32 q4, [r4] ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: vmvn.i16 q5, #0x7f -; CHECK-NEXT: vmov.i16 q6, #0x7f -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB17_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: vdup.32 q6, r3 ; CHECK-NEXT: adds r3, #8 -; CHECK-NEXT: vorr q7, q0, q4 -; CHECK-NEXT: vcmp.u32 cs, q1, q7 -; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov.16 q7[0], r4 -; CHECK-NEXT: vmov r4, s17 -; CHECK-NEXT: vmov.16 q7[1], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: vmov.16 q7[2], r4 -; CHECK-NEXT: vmov r4, s19 
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q7[3], r4 -; CHECK-NEXT: vorr q0, q0, q4 -; CHECK-NEXT: vcmp.u32 cs, q1, q0 -; CHECK-NEXT: vpsel q0, q3, q2 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov.16 q7[4], r4 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: vmov.16 q7[5], r4 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov.16 q7[6], r4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: vmov.16 q7[7], r4 -; CHECK-NEXT: vptt.i16 ne, q7, zr -; CHECK-NEXT: vldrbt.s16 q0, [r0], #8 -; CHECK-NEXT: vldrbt.s16 q4, [r1], #8 -; CHECK-NEXT: vmul.i16 q0, q4, q0 -; CHECK-NEXT: vshr.s16 q0, q0, #7 -; CHECK-NEXT: vmax.s16 q0, q0, q5 -; CHECK-NEXT: vmin.s16 q0, q0, q6 +; CHECK-NEXT: vorr q5, q6, q0 +; CHECK-NEXT: vorr q6, q6, q4 +; CHECK-NEXT: vcmp.u32 cs, q1, q5 +; CHECK-NEXT: vpsel q7, q3, q2 +; CHECK-NEXT: vcmp.u32 cs, q1, q6 +; CHECK-NEXT: vmov r4, s28 +; CHECK-NEXT: vpsel q6, q3, q2 +; CHECK-NEXT: vmov.16 q5[0], r4 +; CHECK-NEXT: vmov r4, s29 +; CHECK-NEXT: vmov.16 q5[1], r4 +; CHECK-NEXT: vmov r4, s30 +; CHECK-NEXT: vmov.16 q5[2], r4 +; CHECK-NEXT: vmov r4, s31 +; CHECK-NEXT: vmov.16 q5[3], r4 +; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: vmov.16 q5[4], r4 +; CHECK-NEXT: vmov r4, s25 +; CHECK-NEXT: vmov.16 q5[5], r4 +; CHECK-NEXT: vmov r4, s26 +; CHECK-NEXT: vmov.16 q5[6], r4 +; CHECK-NEXT: vmov r4, s27 +; CHECK-NEXT: vmov.16 q5[7], r4 +; CHECK-NEXT: vptt.i16 ne, q5, zr +; CHECK-NEXT: vldrbt.s16 q5, [r0], #8 +; CHECK-NEXT: vldrbt.s16 q6, [r1], #8 +; CHECK-NEXT: vmul.i16 q5, q6, q5 +; CHECK-NEXT: vshr.s16 q5, q5, #7 +; CHECK-NEXT: vqmovnb.s16 q5, q5 +; CHECK-NEXT: vmovlb.s8 q5, q5 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrbt.16 q0, [r2], #8 +; CHECK-NEXT: vstrbt.16 q5, [r2], #8 ; CHECK-NEXT: le lr, .LBB17_2 ; CHECK-NEXT: .LBB17_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #40 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 @@ -2967,8 +2915,8 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #120 -; CHECK-NEXT: sub sp, #120 +; CHECK-NEXT: .pad #56 +; CHECK-NEXT: sub sp, #56 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB18_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph @@ -2983,227 +2931,217 @@ ; CHECK-NEXT: add.w lr, lr, r12, lsr #4 ; CHECK-NEXT: sub.w r12, r3, #1 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vdup.32 q0, r12 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI18_2 -; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: vdup.32 q1, r12 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI18_3 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmvn.i16 q0, #0x7f -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q0, #0x7f -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vldrw.u32 q6, [r4] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB18_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q4, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vdup.32 q5, r3 
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vdup.32 q4, r3 ; CHECK-NEXT: adds r3, #16 -; CHECK-NEXT: vorr q4, q5, q4 -; CHECK-NEXT: vcmp.u32 cs, q0, q4 -; CHECK-NEXT: vpsel q6, q3, q2 -; CHECK-NEXT: vmov r4, s24 -; CHECK-NEXT: vmov.16 q4[0], r4 -; CHECK-NEXT: vmov r4, s25 -; CHECK-NEXT: vmov.16 q4[1], r4 -; CHECK-NEXT: vmov r4, s26 -; CHECK-NEXT: vmov.16 q4[2], r4 -; CHECK-NEXT: vmov r4, s27 -; CHECK-NEXT: vldrw.u32 q6, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q4[3], r4 -; CHECK-NEXT: vorr q6, q5, q6 -; CHECK-NEXT: vcmp.u32 cs, q0, q6 -; CHECK-NEXT: vpsel q6, q3, q2 -; CHECK-NEXT: vmov r4, s24 -; CHECK-NEXT: vmov.16 q4[4], r4 -; CHECK-NEXT: vmov r4, s25 -; CHECK-NEXT: vmov.16 q4[5], r4 -; CHECK-NEXT: vmov r4, s26 -; CHECK-NEXT: vmov.16 q4[6], r4 -; CHECK-NEXT: vmov r4, s27 -; CHECK-NEXT: vmov.16 q4[7], r4 -; CHECK-NEXT: vcmp.i16 ne, q4, zr -; CHECK-NEXT: vpsel q6, q3, q2 -; CHECK-NEXT: vmov.u16 r4, q6[0] -; CHECK-NEXT: vmov.8 q4[0], r4 -; CHECK-NEXT: vmov.u16 r4, q6[1] -; CHECK-NEXT: vmov.8 q4[1], r4 -; CHECK-NEXT: vmov.u16 r4, q6[2] -; CHECK-NEXT: vmov.8 q4[2], r4 -; CHECK-NEXT: vmov.u16 r4, q6[3] -; CHECK-NEXT: vmov.8 q4[3], r4 -; CHECK-NEXT: vmov.u16 r4, q6[4] -; CHECK-NEXT: vmov.8 q4[4], r4 -; CHECK-NEXT: vmov.u16 r4, q6[5] -; CHECK-NEXT: vmov.8 q4[5], r4 -; CHECK-NEXT: vmov.u16 r4, q6[6] -; CHECK-NEXT: vmov.8 q4[6], r4 -; CHECK-NEXT: vmov.u16 r4, q6[7] -; CHECK-NEXT: vldrw.u32 q6, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.8 q4[7], r4 -; CHECK-NEXT: vorr q6, q5, q6 -; CHECK-NEXT: vcmp.u32 cs, q0, q6 -; CHECK-NEXT: vpsel q7, q3, q2 -; CHECK-NEXT: vmov r4, s28 -; CHECK-NEXT: vmov.16 q6[0], r4 -; CHECK-NEXT: vmov r4, s29 -; CHECK-NEXT: vmov.16 q6[1], r4 -; CHECK-NEXT: vmov r4, s30 -; CHECK-NEXT: vmov.16 q6[2], r4 -; CHECK-NEXT: vmov r4, s31 -; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q6[3], r4 -; CHECK-NEXT: vorr q5, q5, q7 -; CHECK-NEXT: vcmp.u32 cs, q0, q5 +; CHECK-NEXT: vorr q0, q4, q0 +; CHECK-NEXT: vcmp.u32 cs, q1, q0 ; CHECK-NEXT: vpsel q5, q3, q2 ; CHECK-NEXT: vmov r4, s20 -; CHECK-NEXT: vmov.16 q6[4], r4 +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: vmov r4, s21 +; CHECK-NEXT: vmov.16 q0[1], r4 +; CHECK-NEXT: vmov r4, s22 +; CHECK-NEXT: vmov.16 q0[2], r4 +; CHECK-NEXT: vmov r4, s23 +; CHECK-NEXT: vldrw.u32 q5, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q0[3], r4 +; CHECK-NEXT: vorr q5, q4, q5 +; CHECK-NEXT: vcmp.u32 cs, q1, q5 +; CHECK-NEXT: vpsel q5, q3, q2 +; CHECK-NEXT: vmov r4, s20 +; CHECK-NEXT: vmov.16 q0[4], r4 ; CHECK-NEXT: vmov r4, s21 -; CHECK-NEXT: vmov.16 q6[5], r4 +; CHECK-NEXT: vmov.16 q0[5], r4 ; CHECK-NEXT: vmov r4, s22 -; CHECK-NEXT: vmov.16 q6[6], r4 +; CHECK-NEXT: vmov.16 q0[6], r4 ; CHECK-NEXT: vmov r4, s23 -; CHECK-NEXT: vmov.16 q6[7], r4 -; CHECK-NEXT: vcmp.i16 ne, q6, zr +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vpsel q5, q3, q2 ; CHECK-NEXT: vmov.u16 r4, q5[0] -; CHECK-NEXT: vmov.8 q4[8], r4 +; CHECK-NEXT: vmov.8 q0[0], r4 ; CHECK-NEXT: vmov.u16 r4, q5[1] -; CHECK-NEXT: vmov.8 q4[9], r4 +; CHECK-NEXT: vmov.8 q0[1], r4 ; CHECK-NEXT: vmov.u16 r4, q5[2] -; CHECK-NEXT: vmov.8 q4[10], r4 +; CHECK-NEXT: vmov.8 q0[2], r4 ; CHECK-NEXT: vmov.u16 r4, q5[3] -; CHECK-NEXT: vmov.8 q4[11], r4 +; CHECK-NEXT: vmov.8 q0[3], r4 ; CHECK-NEXT: vmov.u16 r4, q5[4] -; CHECK-NEXT: vmov.8 q4[12], r4 +; CHECK-NEXT: vmov.8 q0[4], r4 ; CHECK-NEXT: vmov.u16 r4, q5[5] -; CHECK-NEXT: vmov.8 q4[13], r4 +; 
CHECK-NEXT: vmov.8 q0[5], r4 ; CHECK-NEXT: vmov.u16 r4, q5[6] -; CHECK-NEXT: vmov.8 q4[14], r4 +; CHECK-NEXT: vmov.8 q0[6], r4 ; CHECK-NEXT: vmov.u16 r4, q5[7] -; CHECK-NEXT: vmov.8 q4[15], r4 -; CHECK-NEXT: vpt.i8 ne, q4, zr -; CHECK-NEXT: vldrbt.u8 q5, [r0], #16 -; CHECK-NEXT: vmov.u8 r4, q5[0] -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u8 q6, [r1], #16 -; CHECK-NEXT: vmov.16 q4[0], r4 -; CHECK-NEXT: vmov.u8 r4, q5[1] -; CHECK-NEXT: vmov.16 q4[1], r4 -; CHECK-NEXT: vmov.u8 r4, q5[2] -; CHECK-NEXT: vmov.16 q4[2], r4 -; CHECK-NEXT: vmov.u8 r4, q5[3] -; CHECK-NEXT: vmov.16 q4[3], r4 -; CHECK-NEXT: vmov.u8 r4, q5[4] -; CHECK-NEXT: vmov.16 q4[4], r4 -; CHECK-NEXT: vmov.u8 r4, q5[5] -; CHECK-NEXT: vmov.16 q4[5], r4 -; CHECK-NEXT: vmov.u8 r4, q5[6] -; CHECK-NEXT: vmov.16 q4[6], r4 -; CHECK-NEXT: vmov.u8 r4, q5[7] -; CHECK-NEXT: vmov.16 q4[7], r4 -; CHECK-NEXT: vmov.u8 r4, q6[0] -; CHECK-NEXT: vmov.16 q7[0], r4 -; CHECK-NEXT: vmov.u8 r4, q6[1] -; CHECK-NEXT: vmov.16 q7[1], r4 -; CHECK-NEXT: vmov.u8 r4, q6[2] -; CHECK-NEXT: vmov.16 q7[2], r4 -; CHECK-NEXT: vmov.u8 r4, q6[3] -; CHECK-NEXT: vmov.16 q7[3], r4 -; CHECK-NEXT: vmov.u8 r4, q6[4] -; CHECK-NEXT: vmov.16 q7[4], r4 -; CHECK-NEXT: vmov.u8 r4, q6[5] -; CHECK-NEXT: vmov.16 q7[5], r4 -; CHECK-NEXT: vmov.u8 r4, q6[6] -; CHECK-NEXT: vmov.16 q7[6], r4 -; CHECK-NEXT: vmov.u8 r4, q6[7] -; CHECK-NEXT: vmov.16 q7[7], r4 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmullb.s8 q4, q7, q4 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vshr.s16 q4, q4, #7 -; CHECK-NEXT: vmax.s16 q4, q4, q0 -; CHECK-NEXT: vmin.s16 q7, q4, q1 -; CHECK-NEXT: vmov.u16 r4, q7[0] -; CHECK-NEXT: vmov.8 q4[0], r4 -; CHECK-NEXT: vmov.u16 r4, q7[1] -; CHECK-NEXT: vmov.8 q4[1], r4 -; CHECK-NEXT: vmov.u16 r4, q7[2] -; CHECK-NEXT: vmov.8 q4[2], r4 -; CHECK-NEXT: vmov.u16 r4, q7[3] -; CHECK-NEXT: vmov.8 q4[3], r4 -; CHECK-NEXT: vmov.u16 r4, q7[4] -; CHECK-NEXT: vmov.8 q4[4], r4 -; CHECK-NEXT: vmov.u16 r4, q7[5] -; CHECK-NEXT: vmov.8 q4[5], r4 -; CHECK-NEXT: vmov.u16 r4, q7[6] -; CHECK-NEXT: vmov.8 q4[6], r4 -; CHECK-NEXT: vmov.u16 r4, q7[7] -; CHECK-NEXT: vmov.8 q4[7], r4 -; CHECK-NEXT: vmov.u8 r4, q5[8] +; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q0[7], r4 +; CHECK-NEXT: vorr q5, q4, q5 +; CHECK-NEXT: vorr q4, q4, q6 +; CHECK-NEXT: vcmp.u32 cs, q1, q5 +; CHECK-NEXT: vpsel q7, q3, q2 +; CHECK-NEXT: vcmp.u32 cs, q1, q4 +; CHECK-NEXT: vmov r4, s28 +; CHECK-NEXT: vpsel q4, q3, q2 +; CHECK-NEXT: vmov.16 q5[0], r4 +; CHECK-NEXT: vmov r4, s29 +; CHECK-NEXT: vmov.16 q5[1], r4 +; CHECK-NEXT: vmov r4, s30 +; CHECK-NEXT: vmov.16 q5[2], r4 +; CHECK-NEXT: vmov r4, s31 +; CHECK-NEXT: vmov.16 q5[3], r4 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov.16 q5[4], r4 +; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vmov.16 q5[5], r4 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: vmov.16 q5[6], r4 +; CHECK-NEXT: vmov r4, s19 +; CHECK-NEXT: vmov.16 q5[7], r4 +; CHECK-NEXT: vcmp.i16 ne, q5, zr +; CHECK-NEXT: vpsel q4, q3, q2 +; CHECK-NEXT: vmov.u16 r4, q4[0] +; CHECK-NEXT: vmov.8 q0[8], r4 +; CHECK-NEXT: vmov.u16 r4, q4[1] +; CHECK-NEXT: vmov.8 q0[9], r4 +; CHECK-NEXT: vmov.u16 r4, q4[2] +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.u16 r4, q4[3] +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.u16 r4, q4[4] +; CHECK-NEXT: vmov.8 q0[12], r4 +; CHECK-NEXT: vmov.u16 r4, q4[5] +; CHECK-NEXT: vmov.8 q0[13], r4 +; CHECK-NEXT: vmov.u16 r4, q4[6] +; CHECK-NEXT: vmov.8 q0[14], r4 +; CHECK-NEXT: vmov.u16 r4, q4[7] +; CHECK-NEXT: vmov.8 
q0[15], r4 +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0], #16 +; CHECK-NEXT: vmov.u8 r4, q0[0] ; CHECK-NEXT: vmov.16 q7[0], r4 -; CHECK-NEXT: vmov.u8 r4, q5[9] +; CHECK-NEXT: vmov.u8 r4, q0[1] ; CHECK-NEXT: vmov.16 q7[1], r4 -; CHECK-NEXT: vmov.u8 r4, q5[10] +; CHECK-NEXT: vmov.u8 r4, q0[2] ; CHECK-NEXT: vmov.16 q7[2], r4 -; CHECK-NEXT: vmov.u8 r4, q5[11] +; CHECK-NEXT: vmov.u8 r4, q0[3] ; CHECK-NEXT: vmov.16 q7[3], r4 -; CHECK-NEXT: vmov.u8 r4, q5[12] +; CHECK-NEXT: vmov.u8 r4, q0[4] ; CHECK-NEXT: vmov.16 q7[4], r4 -; CHECK-NEXT: vmov.u8 r4, q5[13] +; CHECK-NEXT: vmov.u8 r4, q0[5] ; CHECK-NEXT: vmov.16 q7[5], r4 -; CHECK-NEXT: vmov.u8 r4, q5[14] +; CHECK-NEXT: vmov.u8 r4, q0[6] ; CHECK-NEXT: vmov.16 q7[6], r4 -; CHECK-NEXT: vmov.u8 r4, q5[15] +; CHECK-NEXT: vmov.u8 r4, q0[7] ; CHECK-NEXT: vmov.16 q7[7], r4 -; CHECK-NEXT: vmov.u8 r4, q6[8] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u8 q4, [r1], #16 +; CHECK-NEXT: vmov.u8 r4, q4[0] ; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vmov.u8 r4, q6[9] +; CHECK-NEXT: vmov.u8 r4, q4[1] ; CHECK-NEXT: vmov.16 q5[1], r4 -; CHECK-NEXT: vmov.u8 r4, q6[10] +; CHECK-NEXT: vmov.u8 r4, q4[2] ; CHECK-NEXT: vmov.16 q5[2], r4 -; CHECK-NEXT: vmov.u8 r4, q6[11] +; CHECK-NEXT: vmov.u8 r4, q4[3] ; CHECK-NEXT: vmov.16 q5[3], r4 -; CHECK-NEXT: vmov.u8 r4, q6[12] +; CHECK-NEXT: vmov.u8 r4, q4[4] ; CHECK-NEXT: vmov.16 q5[4], r4 -; CHECK-NEXT: vmov.u8 r4, q6[13] +; CHECK-NEXT: vmov.u8 r4, q4[5] ; CHECK-NEXT: vmov.16 q5[5], r4 -; CHECK-NEXT: vmov.u8 r4, q6[14] +; CHECK-NEXT: vmov.u8 r4, q4[6] ; CHECK-NEXT: vmov.16 q5[6], r4 -; CHECK-NEXT: vmov.u8 r4, q6[15] +; CHECK-NEXT: vmov.u8 r4, q4[7] ; CHECK-NEXT: vmov.16 q5[7], r4 ; CHECK-NEXT: vmullb.s8 q5, q5, q7 ; CHECK-NEXT: vshr.s16 q5, q5, #7 -; CHECK-NEXT: vmax.s16 q5, q5, q0 -; CHECK-NEXT: vmin.s16 q5, q5, q1 +; CHECK-NEXT: vqmovnb.s16 q5, q5 +; CHECK-NEXT: vmovlb.s8 q5, q5 ; CHECK-NEXT: vmov.u16 r4, q5[0] -; CHECK-NEXT: vmov.8 q4[8], r4 +; CHECK-NEXT: vmov.8 q7[0], r4 ; CHECK-NEXT: vmov.u16 r4, q5[1] -; CHECK-NEXT: vmov.8 q4[9], r4 +; CHECK-NEXT: vmov.8 q7[1], r4 ; CHECK-NEXT: vmov.u16 r4, q5[2] -; CHECK-NEXT: vmov.8 q4[10], r4 +; CHECK-NEXT: vmov.8 q7[2], r4 ; CHECK-NEXT: vmov.u16 r4, q5[3] -; CHECK-NEXT: vmov.8 q4[11], r4 +; CHECK-NEXT: vmov.8 q7[3], r4 ; CHECK-NEXT: vmov.u16 r4, q5[4] -; CHECK-NEXT: vmov.8 q4[12], r4 +; CHECK-NEXT: vmov.8 q7[4], r4 ; CHECK-NEXT: vmov.u16 r4, q5[5] -; CHECK-NEXT: vmov.8 q4[13], r4 +; CHECK-NEXT: vmov.8 q7[5], r4 ; CHECK-NEXT: vmov.u16 r4, q5[6] -; CHECK-NEXT: vmov.8 q4[14], r4 +; CHECK-NEXT: vmov.8 q7[6], r4 ; CHECK-NEXT: vmov.u16 r4, q5[7] -; CHECK-NEXT: vmov.8 q4[15], r4 +; CHECK-NEXT: vmov.8 q7[7], r4 +; CHECK-NEXT: vmov.u8 r4, q0[8] +; CHECK-NEXT: vmov.16 q5[0], r4 +; CHECK-NEXT: vmov.u8 r4, q0[9] +; CHECK-NEXT: vmov.16 q5[1], r4 +; CHECK-NEXT: vmov.u8 r4, q0[10] +; CHECK-NEXT: vmov.16 q5[2], r4 +; CHECK-NEXT: vmov.u8 r4, q0[11] +; CHECK-NEXT: vmov.16 q5[3], r4 +; CHECK-NEXT: vmov.u8 r4, q0[12] +; CHECK-NEXT: vmov.16 q5[4], r4 +; CHECK-NEXT: vmov.u8 r4, q0[13] +; CHECK-NEXT: vmov.16 q5[5], r4 +; CHECK-NEXT: vmov.u8 r4, q0[14] +; CHECK-NEXT: vmov.16 q5[6], r4 +; CHECK-NEXT: vmov.u8 r4, q0[15] +; CHECK-NEXT: vmov.16 q5[7], r4 +; CHECK-NEXT: vmov.u8 r4, q4[8] +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: vmov.u8 r4, q4[9] +; CHECK-NEXT: vmov.16 q0[1], r4 +; CHECK-NEXT: vmov.u8 r4, q4[10] +; CHECK-NEXT: vmov.16 q0[2], r4 +; CHECK-NEXT: vmov.u8 r4, q4[11] +; CHECK-NEXT: vmov.16 q0[3], r4 +; CHECK-NEXT: vmov.u8 r4, q4[12] +; CHECK-NEXT: vmov.16 q0[4], r4 +; 
CHECK-NEXT: vmov.u8 r4, q4[13] +; CHECK-NEXT: vmov.16 q0[5], r4 +; CHECK-NEXT: vmov.u8 r4, q4[14] +; CHECK-NEXT: vmov.16 q0[6], r4 +; CHECK-NEXT: vmov.u8 r4, q4[15] +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmullb.s8 q0, q0, q5 +; CHECK-NEXT: vshr.s16 q0, q0, #7 +; CHECK-NEXT: vqmovnb.s16 q0, q0 +; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: vmov.u16 r4, q0[0] +; CHECK-NEXT: vmov.8 q7[8], r4 +; CHECK-NEXT: vmov.u16 r4, q0[1] +; CHECK-NEXT: vmov.8 q7[9], r4 +; CHECK-NEXT: vmov.u16 r4, q0[2] +; CHECK-NEXT: vmov.8 q7[10], r4 +; CHECK-NEXT: vmov.u16 r4, q0[3] +; CHECK-NEXT: vmov.8 q7[11], r4 +; CHECK-NEXT: vmov.u16 r4, q0[4] +; CHECK-NEXT: vmov.8 q7[12], r4 +; CHECK-NEXT: vmov.u16 r4, q0[5] +; CHECK-NEXT: vmov.8 q7[13], r4 +; CHECK-NEXT: vmov.u16 r4, q0[6] +; CHECK-NEXT: vmov.8 q7[14], r4 +; CHECK-NEXT: vmov.u16 r4, q0[7] +; CHECK-NEXT: vmov.8 q7[15], r4 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrbt.8 q4, [r2], #16 +; CHECK-NEXT: vstrbt.8 q7, [r2], #16 ; CHECK-NEXT: le lr, .LBB18_2 ; CHECK-NEXT: .LBB18_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #120 +; CHECK-NEXT: add sp, #56 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 @@ -3279,8 +3217,8 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #88 -; CHECK-NEXT: sub sp, #88 +; CHECK-NEXT: .pad #56 +; CHECK-NEXT: sub sp, #56 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB19_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph @@ -3295,127 +3233,121 @@ ; CHECK-NEXT: add.w lr, lr, r12, lsr #4 ; CHECK-NEXT: sub.w r12, r3, #1 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI19_2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI19_3 ; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmvn.i16 q0, #0x7f +; CHECK-NEXT: vldrw.u32 q6, [r4] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q0, #0x7f ; CHECK-NEXT: .LBB19_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vdup.32 q5, r3 +; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: adds r3, #16 -; CHECK-NEXT: vorr q4, q5, q4 +; CHECK-NEXT: vorr q4, q0, q4 ; CHECK-NEXT: vcmp.u32 cs, q1, q4 -; CHECK-NEXT: vpsel q6, q3, q2 -; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: vpsel q4, q3, q2 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov.16 q7[0], r4 +; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vmov.16 q7[1], r4 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: vmov.16 q7[2], r4 +; CHECK-NEXT: vmov r4, s19 +; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q7[3], r4 +; CHECK-NEXT: vorr q4, q0, q4 +; CHECK-NEXT: vcmp.u32 cs, q1, q4 +; CHECK-NEXT: vpsel q4, q3, q2 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov.16 q7[4], r4 +; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vmov.16 q7[5], r4 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: vmov.16 q7[6], r4 +; CHECK-NEXT: vmov r4, s19 +; CHECK-NEXT: vmov.16 q7[7], r4 +; CHECK-NEXT: vcmp.i16 
ne, q7, zr +; CHECK-NEXT: vpsel q4, q3, q2 +; CHECK-NEXT: vmov.u16 r4, q4[0] +; CHECK-NEXT: vmov.8 q7[0], r4 +; CHECK-NEXT: vmov.u16 r4, q4[1] +; CHECK-NEXT: vmov.8 q7[1], r4 +; CHECK-NEXT: vmov.u16 r4, q4[2] +; CHECK-NEXT: vmov.8 q7[2], r4 +; CHECK-NEXT: vmov.u16 r4, q4[3] +; CHECK-NEXT: vmov.8 q7[3], r4 +; CHECK-NEXT: vmov.u16 r4, q4[4] +; CHECK-NEXT: vmov.8 q7[4], r4 +; CHECK-NEXT: vmov.u16 r4, q4[5] +; CHECK-NEXT: vmov.8 q7[5], r4 +; CHECK-NEXT: vmov.u16 r4, q4[6] +; CHECK-NEXT: vmov.8 q7[6], r4 +; CHECK-NEXT: vmov.u16 r4, q4[7] +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q7[7], r4 +; CHECK-NEXT: vorr q4, q0, q4 +; CHECK-NEXT: vorr q0, q0, q6 +; CHECK-NEXT: vcmp.u32 cs, q1, q4 +; CHECK-NEXT: vpsel q5, q3, q2 +; CHECK-NEXT: vcmp.u32 cs, q1, q0 +; CHECK-NEXT: vmov r4, s20 +; CHECK-NEXT: vpsel q0, q3, q2 ; CHECK-NEXT: vmov.16 q4[0], r4 -; CHECK-NEXT: vmov r4, s25 +; CHECK-NEXT: vmov r4, s21 ; CHECK-NEXT: vmov.16 q4[1], r4 -; CHECK-NEXT: vmov r4, s26 +; CHECK-NEXT: vmov r4, s22 ; CHECK-NEXT: vmov.16 q4[2], r4 -; CHECK-NEXT: vmov r4, s27 -; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov r4, s23 ; CHECK-NEXT: vmov.16 q4[3], r4 -; CHECK-NEXT: vorr q6, q5, q6 -; CHECK-NEXT: vcmp.u32 cs, q1, q6 -; CHECK-NEXT: vpsel q6, q3, q2 -; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vmov.16 q4[4], r4 -; CHECK-NEXT: vmov r4, s25 +; CHECK-NEXT: vmov r4, s1 ; CHECK-NEXT: vmov.16 q4[5], r4 -; CHECK-NEXT: vmov r4, s26 +; CHECK-NEXT: vmov r4, s2 ; CHECK-NEXT: vmov.16 q4[6], r4 -; CHECK-NEXT: vmov r4, s27 +; CHECK-NEXT: vmov r4, s3 ; CHECK-NEXT: vmov.16 q4[7], r4 ; CHECK-NEXT: vcmp.i16 ne, q4, zr -; CHECK-NEXT: vpsel q6, q3, q2 -; CHECK-NEXT: vmov.u16 r4, q6[0] -; CHECK-NEXT: vmov.8 q4[0], r4 -; CHECK-NEXT: vmov.u16 r4, q6[1] -; CHECK-NEXT: vmov.8 q4[1], r4 -; CHECK-NEXT: vmov.u16 r4, q6[2] -; CHECK-NEXT: vmov.8 q4[2], r4 -; CHECK-NEXT: vmov.u16 r4, q6[3] -; CHECK-NEXT: vmov.8 q4[3], r4 -; CHECK-NEXT: vmov.u16 r4, q6[4] -; CHECK-NEXT: vmov.8 q4[4], r4 -; CHECK-NEXT: vmov.u16 r4, q6[5] -; CHECK-NEXT: vmov.8 q4[5], r4 -; CHECK-NEXT: vmov.u16 r4, q6[6] -; CHECK-NEXT: vmov.8 q4[6], r4 -; CHECK-NEXT: vmov.u16 r4, q6[7] -; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.8 q4[7], r4 -; CHECK-NEXT: vorr q6, q5, q6 -; CHECK-NEXT: vcmp.u32 cs, q1, q6 -; CHECK-NEXT: vpsel q7, q3, q2 -; CHECK-NEXT: vmov r4, s28 -; CHECK-NEXT: vmov.16 q6[0], r4 -; CHECK-NEXT: vmov r4, s29 -; CHECK-NEXT: vmov.16 q6[1], r4 -; CHECK-NEXT: vmov r4, s30 -; CHECK-NEXT: vmov.16 q6[2], r4 -; CHECK-NEXT: vmov r4, s31 -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q6[3], r4 -; CHECK-NEXT: vorr q5, q5, q7 -; CHECK-NEXT: vcmp.u32 cs, q1, q5 -; CHECK-NEXT: vpsel q5, q3, q2 -; CHECK-NEXT: vmov r4, s20 -; CHECK-NEXT: vmov.16 q6[4], r4 -; CHECK-NEXT: vmov r4, s21 -; CHECK-NEXT: vmov.16 q6[5], r4 -; CHECK-NEXT: vmov r4, s22 -; CHECK-NEXT: vmov.16 q6[6], r4 -; CHECK-NEXT: vmov r4, s23 -; CHECK-NEXT: vmov.16 q6[7], r4 -; CHECK-NEXT: vcmp.i16 ne, q6, zr -; CHECK-NEXT: vpsel q5, q3, q2 -; CHECK-NEXT: vmov.u16 r4, q5[0] -; CHECK-NEXT: vmov.8 q4[8], r4 -; CHECK-NEXT: vmov.u16 r4, q5[1] -; CHECK-NEXT: vmov.8 q4[9], r4 -; CHECK-NEXT: vmov.u16 r4, q5[2] -; CHECK-NEXT: vmov.8 q4[10], r4 -; CHECK-NEXT: vmov.u16 r4, q5[3] -; CHECK-NEXT: vmov.8 q4[11], r4 -; CHECK-NEXT: vmov.u16 r4, q5[4] -; CHECK-NEXT: vmov.8 q4[12], r4 -; CHECK-NEXT: vmov.u16 r4, q5[5] -; CHECK-NEXT: vmov.8 q4[13], r4 -; CHECK-NEXT: vmov.u16 r4, q5[6] -; CHECK-NEXT: 
vmov.8 q4[14], r4
-; CHECK-NEXT: vmov.u16 r4, q5[7]
-; CHECK-NEXT: vmov.8 q4[15], r4
-; CHECK-NEXT: vptt.i8 ne, q4, zr
-; CHECK-NEXT: vldrbt.u8 q4, [r0], #16
-; CHECK-NEXT: vldrbt.u8 q5, [r1], #16
-; CHECK-NEXT: vmullt.s8 q6, q5, q4
-; CHECK-NEXT: vmullb.s8 q4, q5, q4
-; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
-; CHECK-NEXT: vshr.s16 q6, q6, #7
-; CHECK-NEXT: vshr.s16 q4, q4, #7
-; CHECK-NEXT: vmax.s16 q6, q6, q7
-; CHECK-NEXT: vmax.s16 q4, q4, q7
-; CHECK-NEXT: vmin.s16 q6, q6, q0
-; CHECK-NEXT: vmin.s16 q4, q4, q0
-; CHECK-NEXT: vmovnt.i16 q4, q6
+; CHECK-NEXT: vpsel q0, q3, q2
+; CHECK-NEXT: vmov.u16 r4, q0[0]
+; CHECK-NEXT: vmov.8 q7[8], r4
+; CHECK-NEXT: vmov.u16 r4, q0[1]
+; CHECK-NEXT: vmov.8 q7[9], r4
+; CHECK-NEXT: vmov.u16 r4, q0[2]
+; CHECK-NEXT: vmov.8 q7[10], r4
+; CHECK-NEXT: vmov.u16 r4, q0[3]
+; CHECK-NEXT: vmov.8 q7[11], r4
+; CHECK-NEXT: vmov.u16 r4, q0[4]
+; CHECK-NEXT: vmov.8 q7[12], r4
+; CHECK-NEXT: vmov.u16 r4, q0[5]
+; CHECK-NEXT: vmov.8 q7[13], r4
+; CHECK-NEXT: vmov.u16 r4, q0[6]
+; CHECK-NEXT: vmov.8 q7[14], r4
+; CHECK-NEXT: vmov.u16 r4, q0[7]
+; CHECK-NEXT: vmov.8 q7[15], r4
+; CHECK-NEXT: vptt.i8 ne, q7, zr
+; CHECK-NEXT: vldrbt.u8 q0, [r0], #16
+; CHECK-NEXT: vldrbt.u8 q4, [r1], #16
+; CHECK-NEXT: vmullt.s8 q5, q4, q0
+; CHECK-NEXT: vmullb.s8 q0, q4, q0
+; CHECK-NEXT: vshr.s16 q5, q5, #7
+; CHECK-NEXT: vshr.s16 q0, q0, #7
+; CHECK-NEXT: vqmovnb.s16 q5, q5
+; CHECK-NEXT: vqmovnb.s16 q0, q0
+; CHECK-NEXT: vmovlb.s8 q5, q5
+; CHECK-NEXT: vmovlb.s8 q0, q0
+; CHECK-NEXT: vmovnt.i16 q0, q5
 ; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrbt.8 q4, [r2], #16
+; CHECK-NEXT: vstrbt.8 q0, [r2], #16
 ; CHECK-NEXT: le lr, .LBB19_2
 ; CHECK-NEXT: .LBB19_3: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #88
+; CHECK-NEXT: add sp, #56
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: pop {r4, pc}
 ; CHECK-NEXT: .p2align 4
@@ -3518,19 +3450,18 @@
 ; CHECK-NEXT: movs r4, #1
 ; CHECK-NEXT: sub.w r6, r5, #8
 ; CHECK-NEXT: add.w r12, r0, r5
-; CHECK-NEXT: vmov.i16 q0, #0xff
 ; CHECK-NEXT: add.w lr, r4, r6, lsr #3
 ; CHECK-NEXT: adds r4, r2, r5
 ; CHECK-NEXT: adds r6, r1, r5
 ; CHECK-NEXT: dls lr, lr
 ; CHECK-NEXT: .LBB20_4: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrb.u16 q1, [r0], #8
-; CHECK-NEXT: vldrb.u16 q2, [r1], #8
-; CHECK-NEXT: vmul.i16 q1, q2, q1
-; CHECK-NEXT: vshr.u16 q1, q1, #7
-; CHECK-NEXT: vmin.u16 q1, q1, q0
-; CHECK-NEXT: vstrb.16 q1, [r2], #8
+; CHECK-NEXT: vldrb.u16 q0, [r0], #8
+; CHECK-NEXT: vldrb.u16 q1, [r1], #8
+; CHECK-NEXT: vmul.i16 q0, q1, q0
+; CHECK-NEXT: vshr.u16 q0, q0, #7
+; CHECK-NEXT: vqmovnb.u16 q0, q0
+; CHECK-NEXT: vstrb.16 q0, [r2], #8
 ; CHECK-NEXT: le lr, .LBB20_4
 ; CHECK-NEXT: @ %bb.5: @ %middle.block
 ; CHECK-NEXT: cmp r5, r3
@@ -3648,25 +3579,26 @@
 ; CHECK-NEXT: movs r4, #1
 ; CHECK-NEXT: sub.w r6, r5, #16
 ; CHECK-NEXT: add.w r12, r0, r5
-; CHECK-NEXT: vmov.i16 q0, #0xff
 ; CHECK-NEXT: add.w lr, r4, r6, lsr #4
 ; CHECK-NEXT: adds r4, r2, r5
 ; CHECK-NEXT: adds r6, r1, r5
 ; CHECK-NEXT: dls lr, lr
 ; CHECK-NEXT: .LBB21_4: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrb.u16 q1, [r0, #8]
-; CHECK-NEXT: vldrb.u16 q2, [r1, #8]
-; CHECK-NEXT: vmul.i16 q1, q2, q1
-; CHECK-NEXT: vldrb.u16 q2, [r1], #16
-; CHECK-NEXT: vshr.u16 q1, q1, #7
-; CHECK-NEXT: vmin.u16 q1, q1, q0
-; CHECK-NEXT: vstrb.16 q1, [r2, #8]
-; CHECK-NEXT: vldrb.u16 q1, [r0], #16
-; CHECK-NEXT: vmul.i16 q1, q2, q1
-; CHECK-NEXT: vshr.u16 q1, q1, #7
-; CHECK-NEXT: vmin.u16 q1, q1, q0
-; CHECK-NEXT: vstrb.16 q1, [r2], #16
+; CHECK-NEXT: vldrb.u16 q0, [r0, #8]
+; CHECK-NEXT: vldrb.u16 q1, [r1, #8]
+; CHECK-NEXT: vmul.i16 q0, q1, q0
+; CHECK-NEXT: vldrb.u16 q1, [r1], #16
+; CHECK-NEXT: vshr.u16 q0, q0, #7
+; CHECK-NEXT: vqmovnb.u16 q0, q0
+; CHECK-NEXT: vmovlb.u8 q0, q0
+; CHECK-NEXT: vstrb.16 q0, [r2, #8]
+; CHECK-NEXT: vldrb.u16 q0, [r0], #16
+; CHECK-NEXT: vmul.i16 q0, q1, q0
+; CHECK-NEXT: vshr.u16 q0, q0, #7
+; CHECK-NEXT: vqmovnb.u16 q0, q0
+; CHECK-NEXT: vmovlb.u8 q0, q0
+; CHECK-NEXT: vstrb.16 q0, [r2], #16
 ; CHECK-NEXT: le lr, .LBB21_4
 ; CHECK-NEXT: @ %bb.5: @ %middle.block
 ; CHECK-NEXT: cmp r5, r3
diff --git a/llvm/test/CodeGen/Thumb2/mve-vqmovn-combine.ll b/llvm/test/CodeGen/Thumb2/mve-vqmovn-combine.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vqmovn-combine.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqmovn-combine.ll
@@ -4,10 +4,8 @@
 define arm_aapcs_vfpcc <8 x i16> @vqmovni32_sminmax_t1(<4 x i32> %s0, <8 x i16> %src1) {
 ; CHECK-LABEL: vqmovni32_sminmax_t1:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmvn.i32 q2, #0x7fff
-; CHECK-NEXT: vmax.s32 q0, q0, q2
-; CHECK-NEXT: vmov.i32 q2, #0x7fff
-; CHECK-NEXT: vmin.s32 q0, q0, q2
+; CHECK-NEXT: vqmovnb.s32 q0, q0
+; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vmovnt.i32 q1, q0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
@@ -24,10 +22,8 @@
 define arm_aapcs_vfpcc <8 x i16> @vqmovni32_sminmax_t2(<4 x i32> %s0, <8 x i16> %src1) {
 ; CHECK-LABEL: vqmovni32_sminmax_t2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmvn.i32 q2, #0x7fff
-; CHECK-NEXT: vmax.s32 q0, q0, q2
-; CHECK-NEXT: vmov.i32 q2, #0x7fff
-; CHECK-NEXT: vmin.s32 q0, q0, q2
+; CHECK-NEXT: vqmovnb.s32 q0, q0
+; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vmovnt.i32 q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -43,10 +39,8 @@
 define arm_aapcs_vfpcc <8 x i16> @vqmovni32_sminmax_b1(<4 x i32> %s0, <8 x i16> %src1) {
 ; CHECK-LABEL: vqmovni32_sminmax_b1:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmvn.i32 q2, #0x7fff
-; CHECK-NEXT: vmax.s32 q0, q0, q2
-; CHECK-NEXT: vmov.i32 q2, #0x7fff
-; CHECK-NEXT: vmin.s32 q0, q0, q2
+; CHECK-NEXT: vqmovnb.s32 q0, q0
+; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vmovnb.i32 q1, q0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
@@ -63,10 +57,8 @@
 define arm_aapcs_vfpcc <8 x i16> @vqmovni32_sminmax_b2(<4 x i32> %s0, <8 x i16> %src1) {
 ; CHECK-LABEL: vqmovni32_sminmax_b2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmvn.i32 q2, #0x7fff
-; CHECK-NEXT: vmax.s32 q0, q0, q2
-; CHECK-NEXT: vmov.i32 q2, #0x7fff
-; CHECK-NEXT: vmin.s32 q0, q0, q2
+; CHECK-NEXT: vqmovnb.s32 q0, q0
+; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vmovnb.i32 q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -83,8 +75,8 @@
 define arm_aapcs_vfpcc <8 x i16> @vqmovni32_uminmax_t1(<4 x i32> %s0, <8 x i16> %src1) {
 ; CHECK-LABEL: vqmovni32_uminmax_t1:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i32 q2, #0xffff
-; CHECK-NEXT: vmin.u32 q0, q0, q2
+; CHECK-NEXT: vqmovnb.u32 q0, q0
+; CHECK-NEXT: vmovlb.u16 q0, q0
 ; CHECK-NEXT: vmovnt.i32 q1, q0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
@@ -99,8 +91,8 @@
 define arm_aapcs_vfpcc <8 x i16> @vqmovni32_uminmax_t2(<4 x i32> %s0, <8 x i16> %src1) {
 ; CHECK-LABEL: vqmovni32_uminmax_t2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i32 q2, #0xffff
-; CHECK-NEXT: vmin.u32 q0, q0, q2
+; CHECK-NEXT: vqmovnb.u32 q0, q0
+; CHECK-NEXT: vmovlb.u16 q0, q0
 ; CHECK-NEXT: vmovnt.i32 q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -114,8 +106,8 @@
 define arm_aapcs_vfpcc <8 x i16> @vqmovni32_uminmax_b1(<4 x i32> %s0, <8 x i16> %src1) {
 ; CHECK-LABEL: vqmovni32_uminmax_b1:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i32 q2, #0xffff
-; CHECK-NEXT: vmin.u32 q0, q0, q2
+; CHECK-NEXT: vqmovnb.u32 q0, q0
+; CHECK-NEXT: vmovlb.u16 q0, q0
 ; CHECK-NEXT: vmovnb.i32 q1, q0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
@@ -145,10 +137,8 @@
 define arm_aapcs_vfpcc <16 x i8> @vqmovni16_sminmax_t1(<8 x i16> %s0, <16 x i8> %src1) {
 ; CHECK-LABEL: vqmovni16_sminmax_t1:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmvn.i16 q2, #0x7f
-; CHECK-NEXT: vmax.s16 q0, q0, q2
-; CHECK-NEXT: vmov.i16 q2, #0x7f
-; CHECK-NEXT: vmin.s16 q0, q0, q2
+; CHECK-NEXT: vqmovnb.s16 q0, q0
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vmovnt.i16 q1, q0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
@@ -165,10 +155,8 @@
 define arm_aapcs_vfpcc <16 x i8> @vqmovni16_sminmax_t2(<8 x i16> %s0, <16 x i8> %src1) {
 ; CHECK-LABEL: vqmovni16_sminmax_t2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmvn.i16 q2, #0x7f
-; CHECK-NEXT: vmax.s16 q0, q0, q2
-; CHECK-NEXT: vmov.i16 q2, #0x7f
-; CHECK-NEXT: vmin.s16 q0, q0, q2
+; CHECK-NEXT: vqmovnb.s16 q0, q0
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vmovnt.i16 q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -184,10 +172,8 @@
 define arm_aapcs_vfpcc <16 x i8> @vqmovni16_sminmax_b1(<8 x i16> %s0, <16 x i8> %src1) {
 ; CHECK-LABEL: vqmovni16_sminmax_b1:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmvn.i16 q2, #0x7f
-; CHECK-NEXT: vmax.s16 q0, q0, q2
-; CHECK-NEXT: vmov.i16 q2, #0x7f
-; CHECK-NEXT: vmin.s16 q0, q0, q2
+; CHECK-NEXT: vqmovnb.s16 q0, q0
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vmovnb.i16 q1, q0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
@@ -204,10 +190,8 @@
 define arm_aapcs_vfpcc <16 x i8> @vqmovni16_sminmax_b2(<8 x i16> %s0, <16 x i8> %src1) {
 ; CHECK-LABEL: vqmovni16_sminmax_b2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmvn.i16 q2, #0x7f
-; CHECK-NEXT: vmax.s16 q0, q0, q2
-; CHECK-NEXT: vmov.i16 q2, #0x7f
-; CHECK-NEXT: vmin.s16 q0, q0, q2
+; CHECK-NEXT: vqmovnb.s16 q0, q0
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vmovnb.i16 q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -224,8 +208,8 @@
 define arm_aapcs_vfpcc <16 x i8> @vqmovni16_uminmax_t1(<8 x i16> %s0, <16 x i8> %src1) {
 ; CHECK-LABEL: vqmovni16_uminmax_t1:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i16 q2, #0xff
-; CHECK-NEXT: vmin.u16 q0, q0, q2
+; CHECK-NEXT: vqmovnb.u16 q0, q0
+; CHECK-NEXT: vmovlb.u8 q0, q0
 ; CHECK-NEXT: vmovnt.i16 q1, q0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
@@ -240,8 +224,8 @@
 define arm_aapcs_vfpcc <16 x i8> @vqmovni16_uminmax_t2(<8 x i16> %s0, <16 x i8> %src1) {
 ; CHECK-LABEL: vqmovni16_uminmax_t2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i16 q2, #0xff
-; CHECK-NEXT: vmin.u16 q0, q0, q2
+; CHECK-NEXT: vqmovnb.u16 q0, q0
+; CHECK-NEXT: vmovlb.u8 q0, q0
 ; CHECK-NEXT: vmovnt.i16 q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -255,8 +239,8 @@
 define arm_aapcs_vfpcc <16 x i8> @vqmovni16_uminmax_b1(<8 x i16> %s0, <16 x i8> %src1) {
 ; CHECK-LABEL: vqmovni16_uminmax_b1:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i16 q2, #0xff
-; CHECK-NEXT: vmin.u16 q0, q0, q2
+; CHECK-NEXT: vqmovnb.u16 q0, q0
+; CHECK-NEXT: vmovlb.u8 q0, q0
 ; CHECK-NEXT: vmovnb.i16 q1, q0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll
@@ -4,10 +4,8 @@
 define arm_aapcs_vfpcc <4 x i32> @vqmovni32_smaxmin(<4 x i32> %s0) {
 ; CHECK-LABEL: vqmovni32_smaxmin:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i32 q1, #0x7fff
-; CHECK-NEXT: vmin.s32 q0, q0, q1
-; CHECK-NEXT: vmvn.i32 q1, #0x7fff
-; CHECK-NEXT: vmax.s32 q0, q0, q1
+; CHECK-NEXT: vqmovnb.s32 q0, q0
+; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %c1 = icmp slt <4 x i32> %s0,
@@ -20,10 +18,8 @@
 define arm_aapcs_vfpcc <4 x i32> @vqmovni32_sminmax(<4 x i32> %s0) {
 ; CHECK-LABEL: vqmovni32_sminmax:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmvn.i32 q1, #0x7fff
-; CHECK-NEXT: vmax.s32 q0, q0, q1
-; CHECK-NEXT: vmov.i32 q1, #0x7fff
-; CHECK-NEXT: vmin.s32 q0, q0, q1
+; CHECK-NEXT: vqmovnb.s32 q0, q0
+; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %c1 = icmp sgt <4 x i32> %s0,
@@ -36,8 +32,8 @@
 define arm_aapcs_vfpcc <4 x i32> @vqmovni32_umaxmin(<4 x i32> %s0) {
 ; CHECK-LABEL: vqmovni32_umaxmin:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i32 q1, #0xffff
-; CHECK-NEXT: vmin.u32 q0, q0, q1
+; CHECK-NEXT: vqmovnb.u32 q0, q0
+; CHECK-NEXT: vmovlb.u16 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %c1 = icmp ult <4 x i32> %s0,
@@ -48,8 +44,8 @@
 define arm_aapcs_vfpcc <4 x i32> @vqmovni32_uminmax(<4 x i32> %s0) {
 ; CHECK-LABEL: vqmovni32_uminmax:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i32 q1, #0xffff
-; CHECK-NEXT: vmin.u32 q0, q0, q1
+; CHECK-NEXT: vqmovnb.u32 q0, q0
+; CHECK-NEXT: vmovlb.u16 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %c2 = icmp ult <4 x i32> %s0,
@@ -60,10 +56,8 @@
 define arm_aapcs_vfpcc <8 x i16> @vqmovni16_smaxmin(<8 x i16> %s0) {
 ; CHECK-LABEL: vqmovni16_smaxmin:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i16 q1, #0x7f
-; CHECK-NEXT: vmin.s16 q0, q0, q1
-; CHECK-NEXT: vmvn.i16 q1, #0x7f
-; CHECK-NEXT: vmax.s16 q0, q0, q1
+; CHECK-NEXT: vqmovnb.s16 q0, q0
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %c1 = icmp slt <8 x i16> %s0,
@@ -76,10 +70,8 @@
 define arm_aapcs_vfpcc <8 x i16> @vqmovni16_sminmax(<8 x i16> %s0) {
 ; CHECK-LABEL: vqmovni16_sminmax:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmvn.i16 q1, #0x7f
-; CHECK-NEXT: vmax.s16 q0, q0, q1
-; CHECK-NEXT: vmov.i16 q1, #0x7f
-; CHECK-NEXT: vmin.s16 q0, q0, q1
+; CHECK-NEXT: vqmovnb.s16 q0, q0
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %c1 = icmp sgt <8 x i16> %s0,
@@ -92,8 +84,8 @@
 define arm_aapcs_vfpcc <8 x i16> @vqmovni16_umaxmin(<8 x i16> %s0) {
 ; CHECK-LABEL: vqmovni16_umaxmin:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i16 q1, #0xff
-; CHECK-NEXT: vmin.u16 q0, q0, q1
+; CHECK-NEXT: vqmovnb.u16 q0, q0
+; CHECK-NEXT: vmovlb.u8 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %c1 = icmp ult <8 x i16> %s0,
@@ -104,8 +96,8 @@
 define arm_aapcs_vfpcc <8 x i16> @vqmovni16_uminmax(<8 x i16> %s0) {
 ; CHECK-LABEL: vqmovni16_uminmax:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i16 q1, #0xff
-; CHECK-NEXT: vmin.u16 q0, q0, q1
+; CHECK-NEXT: vqmovnb.u16 q0, q0
+; CHECK-NEXT: vmovlb.u8 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %c2 = icmp ult <8 x i16> %s0,
diff --git a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll
@@ -5,10 +5,8 @@
 ; CHECK-LABEL: vqshrni32_smaxmin:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vshr.s32 q0, q0, #3
-; CHECK-NEXT: vmov.i32 q1, #0x7fff
-; CHECK-NEXT: vmin.s32 q0, q0, q1
-; CHECK-NEXT: vmvn.i32 q1, #0x7fff
-; CHECK-NEXT: vmax.s32 q0, q0, q1
+; CHECK-NEXT: vqmovnb.s32 q0, q0
+; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %s0 = ashr <4 x i32> %so,
@@ -23,10 +21,8 @@
 ; CHECK-LABEL: vqshrni32_sminmax:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vshr.s32 q0, q0, #3
-; CHECK-NEXT: vmvn.i32 q1, #0x7fff
-; CHECK-NEXT: vmax.s32 q0, q0, q1
-; CHECK-NEXT: vmov.i32 q1, #0x7fff
-; CHECK-NEXT: vmin.s32 q0, q0, q1
+; CHECK-NEXT: vqmovnb.s32 q0, q0
+; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %s0 = ashr <4 x i32> %so,
@@ -41,8 +37,8 @@
 ; CHECK-LABEL: vqshrni32_umaxmin:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vshr.u32 q0, q0, #3
-; CHECK-NEXT: vmov.i32 q1, #0xffff
-; CHECK-NEXT: vmin.u32 q0, q0, q1
+; CHECK-NEXT: vqmovnb.u32 q0, q0
+; CHECK-NEXT: vmovlb.u16 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %s0 = lshr <4 x i32> %so,
@@ -55,8 +51,8 @@
 ; CHECK-LABEL: vqshrni32_uminmax:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vshr.u32 q0, q0, #3
-; CHECK-NEXT: vmov.i32 q1, #0xffff
-; CHECK-NEXT: vmin.u32 q0, q0, q1
+; CHECK-NEXT: vqmovnb.u32 q0, q0
+; CHECK-NEXT: vmovlb.u16 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %s0 = lshr <4 x i32> %so,
@@ -69,10 +65,8 @@
 ; CHECK-LABEL: vqshrni16_smaxmin:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vshr.s16 q0, q0, #3
-; CHECK-NEXT: vmov.i16 q1, #0x7f
-; CHECK-NEXT: vmin.s16 q0, q0, q1
-; CHECK-NEXT: vmvn.i16 q1, #0x7f
-; CHECK-NEXT: vmax.s16 q0, q0, q1
+; CHECK-NEXT: vqmovnb.s16 q0, q0
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %s0 = ashr <8 x i16> %so,
@@ -87,10 +81,8 @@
 ; CHECK-LABEL: vqshrni16_sminmax:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vshr.s16 q0, q0, #3
-; CHECK-NEXT: vmvn.i16 q1, #0x7f
-; CHECK-NEXT: vmax.s16 q0, q0, q1
-; CHECK-NEXT: vmov.i16 q1, #0x7f
-; CHECK-NEXT: vmin.s16 q0, q0, q1
+; CHECK-NEXT: vqmovnb.s16 q0, q0
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %s0 = ashr <8 x i16> %so,
@@ -105,8 +97,8 @@
 ; CHECK-LABEL: vqshrni16_umaxmin:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vshr.u16 q0, q0, #3
-; CHECK-NEXT: vmov.i16 q1, #0xff
-; CHECK-NEXT: vmin.u16 q0, q0, q1
+; CHECK-NEXT: vqmovnb.u16 q0, q0
+; CHECK-NEXT: vmovlb.u8 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %s0 = lshr <8 x i16> %so,
@@ -119,8 +111,8 @@
 ; CHECK-LABEL: vqshrni16_uminmax:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vshr.u16 q0, q0, #3
-; CHECK-NEXT: vmov.i16 q1, #0xff
-; CHECK-NEXT: vmin.u16 q0, q0, q1
+; CHECK-NEXT: vqmovnb.u16 q0, q0
+; CHECK-NEXT: vmovlb.u8 q0, q0
 ; CHECK-NEXT: bx lr
 entry:
 %s0 = lshr <8 x i16> %so,
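
For reference, a minimal standalone sketch of the pattern these tests exercise (the RUN line and function name below are illustrative, not part of the patch): an unsigned clamp of a v4i32 to the 16-bit range, expressed as icmp ult/select (i.e. a umin against a 0xffff splat), which the new combine selects as vqmovnb.u32 followed by the vmovlb.u16 extend of the bottom lanes.

; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

define arm_aapcs_vfpcc <4 x i32> @umin_clamp_example(<4 x i32> %s0) {
; CHECK-LABEL: umin_clamp_example:
; CHECK: vqmovnb.u32 q0, q0
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: bx lr
entry:
  ; Clamp each lane to at most 65535, i.e. umin(%s0, 0xffff).
  %c = icmp ult <4 x i32> %s0, <i32 65535, i32 65535, i32 65535, i32 65535>
  %s = select <4 x i1> %c, <4 x i32> %s0, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %s
}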