diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -216,6 +216,8 @@
     VMULLs,       // ...signed
     VMULLu,       // ...unsigned
+    VQDMULH,      // MVE vqdmulh instruction
+
     // MVE reductions
     VADDVs,       // sign- or zero-extend the elements of a vector to i32,
     VADDVu,       // add them all together, and return an i32 of their sum
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1718,6 +1718,7 @@
   case ARMISD::VCVTL:         return "ARMISD::VCVTL";
   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
+  case ARMISD::VQDMULH:       return "ARMISD::VQDMULH";
   case ARMISD::VADDVs:        return "ARMISD::VADDVs";
   case ARMISD::VADDVu:        return "ARMISD::VADDVu";
   case ARMISD::VADDVps:       return "ARMISD::VADDVps";
@@ -12206,9 +12207,93 @@
   return Reduction;
 }
 
+// A special combine for the vqdmulh family of instructions. This is one of
+// the potential set of patterns that could match this instruction. The base
+// pattern you would expect is min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
+// This combine matches the variant min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
+// which LLVM will have optimized to min(ashr(mul(sext(x), sext(y)), 15)) as
+// the max is unnecessary.
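+// For example, for a v8i16 vqdmulh the IR being matched looks roughly like
+// this (splat constant operands shown as scalars):
+//   %m = mul <8 x i32> (sext %a), (sext %b)
+//   %s = ashr <8 x i32> %m, 15
+//   %r = smin <8 x i32> %s, 32767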
+static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  SDValue Shft;
+  ConstantSDNode *Clamp;
+
+  if (N->getOpcode() == ISD::SMIN) {
+    Shft = N->getOperand(0);
+    Clamp = isConstOrConstSplat(N->getOperand(1));
+  } else if (N->getOpcode() == ISD::VSELECT) {
+    // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
+    SDValue Cmp = N->getOperand(0);
+    if (Cmp.getOpcode() != ISD::SETCC ||
+        cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
+        Cmp.getOperand(0) != N->getOperand(1) ||
+        Cmp.getOperand(1) != N->getOperand(2))
+      return SDValue();
+    Shft = N->getOperand(1);
+    Clamp = isConstOrConstSplat(N->getOperand(2));
+  } else
+    return SDValue();
+
+  if (!Clamp)
+    return SDValue();
+
+  MVT ScalarType;
+  int ShftAmt = 0;
+  switch (Clamp->getSExtValue()) {
+  case (1 << 7) - 1:
+    ScalarType = MVT::i8;
+    ShftAmt = 7;
+    break;
+  case (1 << 15) - 1:
+    ScalarType = MVT::i16;
+    ShftAmt = 15;
+    break;
+  case (1ULL << 31) - 1:
+    ScalarType = MVT::i32;
+    ShftAmt = 31;
+    break;
+  default:
+    return SDValue();
+  }
+
+  if (Shft.getOpcode() != ISD::SRA)
+    return SDValue();
+  ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
+  if (!N1 || N1->getSExtValue() != ShftAmt)
+    return SDValue();
+
+  SDValue Mul = Shft.getOperand(0);
+  if (Mul.getOpcode() != ISD::MUL)
+    return SDValue();
+
+  SDValue Ext0 = Mul.getOperand(0);
+  SDValue Ext1 = Mul.getOperand(1);
+  if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
+      Ext1.getOpcode() != ISD::SIGN_EXTEND)
+    return SDValue();
+  EVT VecVT = Ext0.getOperand(0).getValueType();
+  if (VecVT != MVT::v4i32 && VecVT != MVT::v8i16 && VecVT != MVT::v16i8)
+    return SDValue();
+  if (Ext1.getOperand(0).getValueType() != VecVT ||
+      VecVT.getScalarType() != ScalarType ||
+      VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
+    return SDValue();
+
+  SDLoc DL(Mul);
+  SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, VecVT, Ext0.getOperand(0),
+                                Ext1.getOperand(0));
+  return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, VQDMULH);
+}
+
 static SDValue PerformVSELECTCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasMVEIntegerOps())
+    return SDValue();
+
+  if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
+    return V;
+
   // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
   //
   // We need to re-implement this optimization here as the implementation in the
@@ -12218,9 +12303,6 @@
   //
   // Currently, this is only done for MVE, as it's the only target that benefits
   // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
-  if (!Subtarget->hasMVEIntegerOps())
-    return SDValue();
-
   if (N->getOperand(0).getOpcode() != ISD::XOR)
     return SDValue();
   SDValue XOR = N->getOperand(0);
@@ -14582,6 +14664,14 @@
     return true;
   };
 
+  // It may be preferable to keep the store unsplit as the trunc may end up
+  // being removed. Check that here.
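+  // For example, trunc(smin(ashr(mul(sext(a), sext(b)), 15), 32767)) turns
+  // into a VQDMULH of the narrower operands; the combine returns a
+  // sign_extend of the new node, so the trunc then folds away and the store
+  // need not be split.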
+  if (Trunc.getOperand(0).getOpcode() == ISD::SMIN) {
+    if (SDValue U = PerformVQDMULHCombine(Trunc.getOperand(0).getNode(), DAG)) {
+      DAG.ReplaceAllUsesWith(Trunc.getOperand(0), U);
+      return SDValue();
+    }
+  }
   if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc->getOperand(0)))
     if (isVMOVNOriginalMask(Shuffle->getMask(), false) ||
         isVMOVNOriginalMask(Shuffle->getMask(), true))
@@ -15555,6 +15645,9 @@
   if (!ST->hasMVEIntegerOps())
     return SDValue();
 
+  if (SDValue V = PerformVQDMULHCombine(N, DAG))
+    return V;
+
   if (VT != MVT::v4i32 && VT != MVT::v8i16)
     return SDValue();
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -1955,28 +1955,26 @@
   let validForTailPredication = 1;
 }
 
+def MVEvqdmulh : SDNode<"ARMISD::VQDMULH", SDTIntBinOp>;
+
 multiclass MVE_VQxDMULH_m<string iname, MVEVectorVTInfo VTI, SDNode Op,
                           Intrinsic unpred_int, Intrinsic pred_int,
                           bit rounding> {
   def "" : MVE_VQxDMULH_Base<iname, VTI.Suffix, VTI.Size, rounding>;
   defvar Inst = !cast<Instruction>(NAME);
+  defm : MVE_TwoOpPattern<VTI, Op, pred_int, (? ), Inst>;
 
   let Predicates = [HasMVEInt] in {
-    // Unpredicated multiply
-    def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+    // Extra unpredicated multiply intrinsic patterns
+    def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
               (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
-
-    // Predicated multiply
-    def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
-                        (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
-              (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
-                        ARMVCCThen, (VTI.Pred VCCR:$mask),
-                        (VTI.Vec MQPR:$inactive)))>;
   }
 }
 
 multiclass MVE_VQxDMULH<string iname, MVEVectorVTInfo VTI, SDNode Op,
                         Intrinsic unpred_int, Intrinsic pred_int, bit rounding>
-  : MVE_VQxDMULH_m<iname, VTI, unpred_int, pred_int, rounding>;
+  : MVE_VQxDMULH_m<iname, VTI, Op, unpred_int, pred_int, rounding>;
 
 multiclass MVE_VxxMUL_qr_m<string iname, MVEVectorVTInfo VTI, bit bit_28,
-                           Intrinsic int_unpred, Intrinsic int_pred> {
+                           PatFrag Op, Intrinsic int_unpred, Intrinsic int_pred> {
   def "" : MVE_VxxMUL_qr<iname, VTI.Suffix, bit_28, VTI.Size>;
-  defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
-                                  int_unpred, int_pred>;
+  defm : MVE_TwoOpPatternDup<VTI, Op, int_pred, (? ), !cast<Instruction>(NAME)>;
+  defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI, int_unpred, int_pred>;
 }
 
 multiclass MVE_VQDMULH_qr_m<MVEVectorVTInfo VTI> :
-  MVE_VxxMUL_qr_m<"vqdmulh", VTI, 0b0,
+  MVE_VxxMUL_qr_m<"vqdmulh", VTI, 0b0, MVEvqdmulh,
                   int_arm_mve_vqdmulh, int_arm_mve_qdmulh_predicated>;
 
 multiclass MVE_VQRDMULH_qr_m<MVEVectorVTInfo VTI> :
-  MVE_VxxMUL_qr_m<"vqrdmulh", VTI, 0b1,
+  MVE_VxxMUL_qr_m<"vqrdmulh", VTI, 0b1, null_frag,
                   int_arm_mve_vqrdmulh, int_arm_mve_qrdmulh_predicated>;
 
 defm MVE_VQDMULH_qr_s8 : MVE_VQDMULH_qr_m<MVE_v16s8>;
diff --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll @@ -4,106 +4,8 @@
 define arm_aapcs_vfpcc i32 @vqdmulh_i8(<16 x i8> %s0, <16 x i8> %s1) {
 ; CHECK-LABEL: vqdmulh_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: vmov.u8 r0, q0[12]
-; CHECK-NEXT: vmov.32 q2[0], r0
-; CHECK-NEXT: vmov.u8 r0, q0[13]
-; CHECK-NEXT: vmov.32 q2[1], r0
-; CHECK-NEXT: vmov.u8 r0, q0[14]
-; CHECK-NEXT: vmov.32 q2[2], r0
-; CHECK-NEXT: vmov.u8 r0, q0[15]
-; CHECK-NEXT: vmov.32 q2[3], r0
-; CHECK-NEXT: vmov.u8 r0, q1[12]
-; CHECK-NEXT: vmov.32 q3[0], r0
-; CHECK-NEXT: vmov.u8 r0, q1[13]
-; CHECK-NEXT: vmov.32 q3[1], r0
-; CHECK-NEXT: vmov.u8 r0, q1[14]
-; CHECK-NEXT: vmov.32 q3[2], r0
-; CHECK-NEXT: vmov.u8 r0, q1[15]
-; CHECK-NEXT: vmov.32 q3[3], r0
-; CHECK-NEXT: vmov.u8 r0, q0[4]
-; CHECK-NEXT: vmov.32 q4[0], r0
-; CHECK-NEXT: vmov.u8 r0, q0[5]
-; CHECK-NEXT: vmov.32 q4[1], r0
-; CHECK-NEXT: vmov.u8 r0, q0[6]
-; CHECK-NEXT: vmov.32 q4[2], r0
-; CHECK-NEXT: vmov.u8 r0, q0[7]
-; CHECK-NEXT: vmov.32 q4[3], r0
-; CHECK-NEXT: vmov.u8 r0, q1[4]
-; CHECK-NEXT: vmov.32 q5[0], r0
-; CHECK-NEXT: vmov.u8
r0, q1[5] -; CHECK-NEXT: vmov.32 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.32 q5[3], r0 -; CHECK-NEXT: vmovlb.s8 q2, q2 -; CHECK-NEXT: vmovlb.s8 q3, q3 -; CHECK-NEXT: vmovlb.s8 q4, q4 -; CHECK-NEXT: vmovlb.s8 q5, q5 -; CHECK-NEXT: vmovlb.s16 q2, q2 -; CHECK-NEXT: vmovlb.s16 q3, q3 -; CHECK-NEXT: vmovlb.s16 q4, q4 -; CHECK-NEXT: vmovlb.s16 q5, q5 -; CHECK-NEXT: vmul.i32 q2, q3, q2 -; CHECK-NEXT: vmul.i32 q4, q5, q4 -; CHECK-NEXT: vshr.s32 q3, q2, #7 -; CHECK-NEXT: vmov.i32 q2, #0x7f -; CHECK-NEXT: vshr.s32 q4, q4, #7 -; CHECK-NEXT: vmin.s32 q3, q3, q2 -; CHECK-NEXT: vmin.s32 q4, q4, q2 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vadd.i32 q3, q4, q3 -; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] -; CHECK-NEXT: vmov.32 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.32 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.32 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.32 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.32 q5[3], r0 -; CHECK-NEXT: vmovlb.s8 q4, q4 -; CHECK-NEXT: vmovlb.s8 q5, q5 -; CHECK-NEXT: vmovlb.s16 q4, q4 -; CHECK-NEXT: vmovlb.s16 q5, q5 -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmul.i32 q4, q5, q4 -; CHECK-NEXT: vmov.32 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vshr.s32 q4, q4, #7 -; CHECK-NEXT: vmov.32 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.32 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmovlb.s8 q0, q5 -; CHECK-NEXT: vmov.32 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vmov.32 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.32 q5[3], r0 -; CHECK-NEXT: vmin.s32 q4, q4, q2 -; CHECK-NEXT: vmovlb.s8 q1, q5 -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vshr.s32 q0, q0, #7 -; CHECK-NEXT: vmin.s32 q0, q0, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q4 -; CHECK-NEXT: vadd.i32 q0, q0, q3 -; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vqdmulh.s8 q0, q1, q0 +; CHECK-NEXT: vaddv.s8 r0, q0 ; CHECK-NEXT: bx lr entry: %l2 = sext <16 x i8> %s0 to <16 x i32> @@ -119,135 +21,7 @@ define arm_aapcs_vfpcc <16 x i8> @vqdmulh_i8_b(<16 x i8> %s0, <16 x i8> %s1) { ; CHECK-LABEL: vqdmulh_i8_b: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[3] -; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.32 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s8 q3, q3 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vmovlb.s16 q3, q3 -; CHECK-NEXT: vmul.i32 q0, q3, q0 -; CHECK-NEXT: vmov.i32 q3, #0x7f -; CHECK-NEXT: vshr.s32 q0, q0, #7 -; 
CHECK-NEXT: vmin.s32 q4, q0, q3 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[5] -; CHECK-NEXT: vmov.32 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[6] -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[7] -; CHECK-NEXT: vmov.32 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.32 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.32 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.32 q5[3], r0 -; CHECK-NEXT: vmovlb.s8 q4, q4 -; CHECK-NEXT: vmovlb.s8 q5, q5 -; CHECK-NEXT: vmovlb.s16 q4, q4 -; CHECK-NEXT: vmovlb.s16 q5, q5 -; CHECK-NEXT: vmul.i32 q4, q5, q4 -; CHECK-NEXT: vshr.s32 q4, q4, #7 -; CHECK-NEXT: vmin.s32 q4, q4, q3 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q2[8] -; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[9] -; CHECK-NEXT: vmov.32 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[10] -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[11] -; CHECK-NEXT: vmov.32 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.32 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.32 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.32 q5[3], r0 -; CHECK-NEXT: vmovlb.s8 q4, q4 -; CHECK-NEXT: vmovlb.s8 q5, q5 -; CHECK-NEXT: vmovlb.s16 q4, q4 -; CHECK-NEXT: vmovlb.s16 q5, q5 -; CHECK-NEXT: vmul.i32 q4, q5, q4 -; CHECK-NEXT: vshr.s32 q4, q4, #7 -; CHECK-NEXT: vmin.s32 q4, q4, q3 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q2[12] -; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[13] -; CHECK-NEXT: vmov.32 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[14] -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[15] -; CHECK-NEXT: vmov.32 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmovlb.s8 q2, q4 -; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmovlb.s16 q2, q2 -; CHECK-NEXT: vmov.32 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.32 q4[3], r0 -; CHECK-NEXT: vmovlb.s8 q1, q4 -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmul.i32 q1, q1, q2 -; CHECK-NEXT: vshr.s32 q1, q1, #7 -; CHECK-NEXT: vmin.s32 q1, q1, q3 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.8 q0[15], r0 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vqdmulh.s8 q0, q1, q0 ; CHECK-NEXT: bx lr entry: %l2 = sext <16 x i8> %s0 to <16 x i32> @@ -263,50 +37,8 @@ define arm_aapcs_vfpcc i32 @vqdmulh_i16(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-LABEL: 
vqdmulh_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.32 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.32 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.32 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmullb.s16 q2, q3, q2 -; CHECK-NEXT: vmullb.s16 q0, q0, q4 -; CHECK-NEXT: vshr.s32 q3, q2, #15 -; CHECK-NEXT: vmov.i32 q2, #0x7fff -; CHECK-NEXT: vshr.s32 q0, q0, #15 -; CHECK-NEXT: vmin.s32 q3, q3, q2 -; CHECK-NEXT: vmin.s32 q0, q0, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q3 -; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vqdmulh.s16 q0, q1, q0 +; CHECK-NEXT: vaddv.s16 r0, q0 ; CHECK-NEXT: bx lr entry: %l2 = sext <8 x i16> %s0 to <8 x i32> @@ -322,65 +54,7 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_b(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-LABEL: vqdmulh_i16_b: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.32 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmullb.s16 q0, q3, q0 -; CHECK-NEXT: vmov.i32 q3, #0x7fff -; CHECK-NEXT: vshr.s32 q0, q0, #15 -; CHECK-NEXT: vmin.s32 q4, q0, q3 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.32 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.32 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.32 q2[3], r0 -; CHECK-NEXT: vmullb.s16 q1, q2, q4 -; CHECK-NEXT: vshr.s32 q1, q1, #15 -; CHECK-NEXT: vmin.s32 q1, q1, q3 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, 
s5 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vqdmulh.s16 q0, q1, q0 ; CHECK-NEXT: bx lr entry: %l2 = sext <8 x i16> %s0 to <8 x i32> @@ -474,109 +148,9 @@ define arm_aapcs_vfpcc i64 @vqdmulh_i32(<4 x i32> %s0, <4 x i32> %s1) { ; CHECK-LABEL: vqdmulh_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: mvn r12, #-2147483648 -; CHECK-NEXT: vmov.f32 s16, s4 -; CHECK-NEXT: vmov.f32 s18, s5 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vmov r7, s18 -; CHECK-NEXT: smull r2, r3, r1, r0 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: asrl r2, r3, #31 -; CHECK-NEXT: subs.w r1, r2, r12 -; CHECK-NEXT: vmov.32 q5[0], r2 -; CHECK-NEXT: sbcs r1, r3, #0 -; CHECK-NEXT: vmov.32 q5[1], r3 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov.32 q3[0], r1 -; CHECK-NEXT: vmov.32 q3[1], r1 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: smull r4, r1, r7, r1 -; CHECK-NEXT: asrl r4, r1, #31 -; CHECK-NEXT: subs.w r7, r4, r12 -; CHECK-NEXT: vmov.32 q5[2], r4 -; CHECK-NEXT: sbcs r7, r1, #0 -; CHECK-NEXT: vmov.32 q5[3], r1 -; CHECK-NEXT: mov.w r7, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #1 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csetm r7, ne -; CHECK-NEXT: vmov.32 q3[2], r7 -; CHECK-NEXT: vmov.32 q3[3], r7 -; CHECK-NEXT: adr r7, .LCPI5_0 -; CHECK-NEXT: vldrw.u32 q2, [r7] -; CHECK-NEXT: vbic q4, q2, q3 -; CHECK-NEXT: vand q3, q5, q3 -; CHECK-NEXT: vorr q3, q3, q4 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: vmov r7, s12 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s2, s7 -; CHECK-NEXT: vmullb.s32 q1, q0, q3 -; CHECK-NEXT: vmov r5, s5 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: adds.w lr, r7, r1 -; CHECK-NEXT: adcs r3, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: asrl r2, r5, #31 -; CHECK-NEXT: subs.w r7, r2, r12 -; CHECK-NEXT: sbcs r7, r5, #0 -; CHECK-NEXT: mov.w r7, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #1 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csetm r7, ne -; CHECK-NEXT: vmov.32 q0[0], r7 -; CHECK-NEXT: vmov.32 q0[1], r7 -; CHECK-NEXT: vmov r7, s7 -; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: subs.w r1, r4, r12 -; CHECK-NEXT: sbcs r1, r7, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vbic q1, q2, q0 -; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmov.32 q2[1], r5 -; CHECK-NEXT: vmov.32 q2[2], r4 -; CHECK-NEXT: vmov.32 q2[3], r7 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r2, r3, r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI5_0: -; CHECK-NEXT: .long 2147483647 @ 0x7fffffff -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 
2147483647 @ 0x7fffffff -; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: vqdmulh.s32 q0, q1, q0 +; CHECK-NEXT: vaddlv.s32 r0, r1, q0 +; CHECK-NEXT: bx lr entry: %l2 = sext <4 x i32> %s0 to <4 x i64> %l5 = sext <4 x i32> %s1 to <4 x i64> @@ -591,90 +165,8 @@ define arm_aapcs_vfpcc <4 x i32> @vqdmulh_i32_b(<4 x i32> %s0, <4 x i32> %s1) { ; CHECK-LABEL: vqdmulh_i32_b: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r5, lr} -; CHECK-NEXT: push {r5, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: mvn r12, #-2147483648 -; CHECK-NEXT: vmov.f32 s16, s6 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmullb.s32 q3, q4, q2 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov r5, s13 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: asrl r2, r5, #31 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: subs.w r0, r2, r12 -; CHECK-NEXT: vmov.32 q5[0], r2 -; CHECK-NEXT: sbcs r0, r5, #0 -; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: mov.w r0, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: asrl r0, r5, #31 -; CHECK-NEXT: subs.w r1, r0, r12 -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: sbcs r1, r5, #0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov.32 q2[2], r1 -; CHECK-NEXT: adr r1, .LCPI6_0 -; CHECK-NEXT: vldrw.u32 q3, [r1] -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vbic q4, q3, q2 -; CHECK-NEXT: vand q2, q5, q2 -; CHECK-NEXT: vorr q2, q2, q4 -; CHECK-NEXT: smull r2, r1, r1, r0 -; CHECK-NEXT: asrl r2, r1, #31 -; CHECK-NEXT: subs.w r0, r2, r12 -; CHECK-NEXT: sbcs r0, r1, #0 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: mov.w r0, #0 -; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: vmov.32 q4[1], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: asrl r0, r1, #31 -; CHECK-NEXT: subs.w r5, r0, r12 -; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: sbcs r1, r1, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov.32 q4[2], r1 -; CHECK-NEXT: vbic q0, q3, q4 -; CHECK-NEXT: vand q1, q1, q4 -; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s10 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r5, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI6_0: -; CHECK-NEXT: .long 2147483647 @ 0x7fffffff -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 2147483647 @ 0x7fffffff -; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: vqdmulh.s32 q0, q1, q0 +; CHECK-NEXT: bx lr entry: %l2 = sext <4 x i32> %s0 to <4 x i64> %l5 = sext <4 x i32> %s1 to <4 x i64> @@ -695,34 +187,13 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #64 -; CHECK-NEXT: vmov.i32 q0, #0x7f ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.s32 q1, [r0, #12] -; CHECK-NEXT: vldrb.s32 q2, [r1, #12] -; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vldrb.s32 q2, [r1, #8] -; CHECK-NEXT: vshr.s32 q1, q1, #7 -; 
CHECK-NEXT: vmin.s32 q1, q1, q0 -; CHECK-NEXT: vstrb.32 q1, [r2, #12] -; CHECK-NEXT: vldrb.s32 q1, [r0, #8] -; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vldrb.s32 q2, [r1, #4] -; CHECK-NEXT: vshr.s32 q1, q1, #7 -; CHECK-NEXT: vmin.s32 q1, q1, q0 -; CHECK-NEXT: vstrb.32 q1, [r2, #8] -; CHECK-NEXT: vldrb.s32 q1, [r0, #4] -; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vldrb.s32 q2, [r1], #16 -; CHECK-NEXT: vshr.s32 q1, q1, #7 -; CHECK-NEXT: vmin.s32 q1, q1, q0 -; CHECK-NEXT: vstrb.32 q1, [r2, #4] -; CHECK-NEXT: vldrb.s32 q1, [r0], #16 -; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vshr.s32 q1, q1, #7 -; CHECK-NEXT: vmin.s32 q1, q1, q0 -; CHECK-NEXT: vstrb.32 q1, [r2], #16 +; CHECK-NEXT: vldrb.u8 q0, [r0], #16 +; CHECK-NEXT: vldrb.u8 q1, [r1], #16 +; CHECK-NEXT: vqdmulh.s8 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -761,22 +232,13 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #128 -; CHECK-NEXT: vmov.i32 q0, #0x7fff ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.s32 q1, [r0, #8] -; CHECK-NEXT: vldrh.s32 q2, [r1, #8] -; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vldrh.s32 q2, [r1], #16 -; CHECK-NEXT: vshr.s32 q1, q1, #15 -; CHECK-NEXT: vmin.s32 q1, q1, q0 -; CHECK-NEXT: vstrh.32 q1, [r2, #8] -; CHECK-NEXT: vldrh.s32 q1, [r0], #16 -; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vshr.s32 q1, q1, #15 -; CHECK-NEXT: vmin.s32 q1, q1, q0 -; CHECK-NEXT: vstrh.32 q1, [r2], #16 +; CHECK-NEXT: vldrh.u16 q0, [r0], #16 +; CHECK-NEXT: vldrh.u16 q1, [r1], #16 +; CHECK-NEXT: vqdmulh.s16 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -812,99 +274,19 @@ define void @vqdmulh_loop_i32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vqdmulh_loop_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: adr r3, .LCPI9_0 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #256 -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: mvn r4, #-2147483648 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s16, s10 -; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: vmov.f32 s18, s11 -; CHECK-NEXT: vmullb.s32 q5, q4, q3 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov r3, s21 -; CHECK-NEXT: vmov r12, s20 -; CHECK-NEXT: asrl r12, r3, #31 -; CHECK-NEXT: vmov r6, s22 -; CHECK-NEXT: subs.w r5, r12, r4 -; CHECK-NEXT: vmov.f32 s10, s9 -; CHECK-NEXT: sbcs r3, r3, #0 -; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov.32 q3[0], r3 -; CHECK-NEXT: vmov.32 q3[1], r3 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: asrl r6, r3, #31 -; CHECK-NEXT: vmov.32 q5[0], r12 -; CHECK-NEXT: subs r5, r6, r4 -; CHECK-NEXT: vmov.32 q5[2], r6 -; CHECK-NEXT: sbcs r3, r3, #0 -; CHECK-NEXT: vmov r6, s8 -; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: cmp 
r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov.32 q3[2], r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vbic q4, q0, q3 -; CHECK-NEXT: vand q3, q5, q3 -; CHECK-NEXT: vorr q3, q3, q4 -; CHECK-NEXT: smull r12, r3, r6, r3 -; CHECK-NEXT: asrl r12, r3, #31 -; CHECK-NEXT: subs.w r5, r12, r4 -; CHECK-NEXT: sbcs r3, r3, #0 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: vmov.32 q2[0], r12 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov.32 q4[0], r3 -; CHECK-NEXT: vmov.32 q4[1], r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: smull r6, r3, r5, r3 -; CHECK-NEXT: asrl r6, r3, #31 -; CHECK-NEXT: subs r5, r6, r4 -; CHECK-NEXT: vmov.32 q2[2], r6 -; CHECK-NEXT: sbcs r3, r3, #0 -; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov.32 q4[2], r3 -; CHECK-NEXT: vbic q1, q0, q4 -; CHECK-NEXT: vand q2, q2, q4 -; CHECK-NEXT: vorr q1, q2, q1 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vmov.f32 s7, s14 -; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vqdmulh.s32 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB9_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r6, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI9_0: -; CHECK-NEXT: .long 2147483647 @ 0x7fffffff -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 2147483647 @ 0x7fffffff -; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: pop {r7, pc} entry: br label %vector.body
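For reference, a scalar model of what the new ARMISD::VQDMULH node computes per lane (an illustrative sketch only, not code from this patch; the function name and the choice of 16-bit lanes are assumptions for the example):

#include <algorithm>
#include <cstdint>

// Saturating doubling multiply returning the high half: (2*a*b) >> 16, which
// for 16-bit inputs equals (a*b) >> 15 (arithmetic shift assumed). Only
// a == b == INT16_MIN can overflow upwards, so a single upper clamp suffices;
// this is why the matched IR contains an smin but no smax.
std::int16_t vqdmulh_s16(std::int16_t a, std::int16_t b) {
  std::int32_t p = (std::int32_t(a) * std::int32_t(b)) >> 15;
  return std::int16_t(std::min(p, std::int32_t(INT16_MAX)));
}

For instance, vqdmulh_s16(INT16_MIN, INT16_MIN) yields INT16_MAX, the one saturating case.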