diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -296,6 +296,10 @@ setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal); setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal); setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal); + setOperationAction(ISD::VECREDUCE_MUL, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); + setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); if (!HasMVEFP) { setOperationAction(ISD::SINT_TO_FP, VT, Expand); @@ -345,6 +349,10 @@ setOperationAction(ISD::FMINNUM, VT, Legal); setOperationAction(ISD::FMAXNUM, VT, Legal); setOperationAction(ISD::FROUND, VT, Legal); + setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); // No native support for these. setOperationAction(ISD::FDIV, VT, Expand); @@ -362,6 +370,17 @@ } } + // Custom-expand reductions of smaller-than-legal vector types to prevent + // false zero items from being added. + setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom); + setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom); + setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom); + setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom); + setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom); + setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom); + // We 'support' these types up to bitcast/load/store level, regardless of // MVE integer-only / float support. Only doing FP data processing on the FP // vector types is inhibited at integer-only level. 
@@ -9498,6 +9517,79 @@ return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); } +static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + if (!ST->hasMVEIntegerOps()) + return SDValue(); + + SDLoc dl(Op); + unsigned BaseOpcode = 0; + switch (Op->getOpcode()) { + default: llvm_unreachable("Expected VECREDUCE opcode"); + case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break; + case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break; + case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break; + case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break; + case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break; + case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break; + case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break; + case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break; + } + + SDValue Op0 = Op->getOperand(0); + EVT VT = Op0.getValueType(); + EVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumActiveLanes = NumElts; + + assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 || + NumActiveLanes == 2) && + "Only expected a power 2 vector size"); + + // Apply BaseOpcode(X, Rev(X)) until 4 active lanes remain. Going down to 4 + // lanes allows us to easily extract vector elements from the lanes. + while (NumActiveLanes > 4) { + unsigned RevOpcode = NumActiveLanes == 16 ? 
ARMISD::VREV16 : ARMISD::VREV32; + SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0); + Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev); + NumActiveLanes /= 2; + } + + SDValue Res; + if (NumActiveLanes == 4) { + // Combine lanes i * NumElts / 4 pairwise: (e0 op e1) op (e2 op e3) + SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(0 * NumElts / 4, dl, MVT::i32)); + SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(1 * NumElts / 4, dl, MVT::i32)); + SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(2 * NumElts / 4, dl, MVT::i32)); + SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(3 * NumElts / 4, dl, MVT::i32)); + SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); + SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags()); + Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags()); + } else { + SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(0, dl, MVT::i32)); + SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(1, dl, MVT::i32)); + Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); + } + + // Result type may be wider than element type. 
+ if (EltVT != Op->getValueType(0)) + Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res); + return Res; +} + +static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + if (!ST->hasMVEFloatOps()) + return SDValue(); + return LowerVecReduce(Op, DAG, ST); +} + static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { if (isStrongerThanMonotonic(cast(Op)->getOrdering())) // Acquire/Release load/store is not legal for targets without a dmb or @@ -9702,6 +9794,16 @@ return LowerSTORE(Op, DAG, Subtarget); case ISD::MLOAD: return LowerMLOAD(Op, DAG); + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + return LowerVecReduce(Op, DAG, Subtarget); + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAX: + return LowerVecReduceF(Op, DAG, Subtarget); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll @@ -16,12 +16,12 @@ define arm_aapcs_vfpcc i32 @and_v4i32(<4 x i32> %x) { ; CHECK-LABEL: and_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: ands r0, r1 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: @@ -33,12 +33,12 @@ ; CHECK-LABEL: and_v8i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: ands r0, r1 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: ands r0, 
r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: @@ -49,12 +49,12 @@ define arm_aapcs_vfpcc i16 @and_v4i16(<4 x i16> %x) { ; CHECK-LABEL: and_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: ands r0, r1 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: @@ -65,20 +65,14 @@ define arm_aapcs_vfpcc i16 @and_v8i16(<8 x i16> %x) { ; CHECK-LABEL: and_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: ands r0, r1 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: @@ -90,20 +84,14 @@ ; CHECK-LABEL: and_v16i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: ands r0, r1 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: ands r0, r1 -; 
CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: @@ -114,20 +102,14 @@ define arm_aapcs_vfpcc i8 @and_v8i8(<8 x i8> %x) { ; CHECK-LABEL: and_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: ands r0, r1 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: @@ -138,36 +120,16 @@ define arm_aapcs_vfpcc i8 @and_v16i8(<16 x i8> %x) { ; CHECK-LABEL: and_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: ands r0, r1 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.u8 r0, q0[12] ; CHECK-NEXT: vmov.u8 r1, q0[8] ; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[12] -; 
CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[14] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: @@ -179,36 +141,16 @@ ; CHECK-LABEL: and_v32i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: ands r0, r1 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.u8 r0, q0[12] ; CHECK-NEXT: vmov.u8 r1, q0[8] ; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[14] -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr entry: @@ -273,12 +215,12 @@ define arm_aapcs_vfpcc i32 @and_v4i32_acc(<4 x i32> %x, i32 %y) { ; CHECK-LABEL: and_v4i32_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov 
r2, s3 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: ands r2, r3 ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr @@ -292,12 +234,12 @@ ; CHECK-LABEL: and_v8i32_acc: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: ands r2, r3 ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr @@ -310,12 +252,12 @@ define arm_aapcs_vfpcc i16 @and_v4i16_acc(<4 x i16> %x, i16 %y) { ; CHECK-LABEL: and_v4i16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: ands r2, r3 ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr @@ -328,20 +270,14 @@ define arm_aapcs_vfpcc i16 @and_v8i16_acc(<8 x i16> %x, i16 %y) { ; CHECK-LABEL: and_v8i16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: ands r2, r3 ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr @@ -355,20 +291,14 @@ ; CHECK-LABEL: and_v16i16_acc: ; CHECK: @ %bb.0: @ %entry ; 
CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: ands r2, r3 ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr @@ -381,20 +311,14 @@ define arm_aapcs_vfpcc i8 @and_v8i8_acc(<8 x i8> %x, i8 %y) { ; CHECK-LABEL: and_v8i8_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: ands r2, r3 ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr @@ -407,36 +331,16 @@ define arm_aapcs_vfpcc i8 @and_v16i8_acc(<16 x i8> %x, i8 %y) { ; CHECK-LABEL: and_v16i8_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[3] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 
r2, q0[5] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[7] -; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.u8 r1, q0[12] ; CHECK-NEXT: vmov.u8 r2, q0[8] ; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[13] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: vmov.u8 r3, q0[0] +; CHECK-NEXT: ands r2, r3 ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr @@ -450,36 +354,16 @@ ; CHECK-LABEL: and_v32i8_acc: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[3] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[5] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[7] -; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.u8 r1, q0[12] ; CHECK-NEXT: vmov.u8 r2, q0[8] ; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[13] -; 
CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: vmov.u8 r3, q0[0] +; CHECK-NEXT: ands r2, r3 ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr @@ -553,12 +437,12 @@ define arm_aapcs_vfpcc i32 @or_v4i32(<4 x i32> %x) { ; CHECK-LABEL: or_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: @@ -570,12 +454,12 @@ ; CHECK-LABEL: or_v8i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: @@ -586,12 +470,12 @@ define arm_aapcs_vfpcc i16 @or_v4i16(<4 x i16> %x) { ; CHECK-LABEL: or_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: @@ -602,20 +486,14 @@ define arm_aapcs_vfpcc i16 @or_v8i16(<8 x i16> %x) { ; CHECK-LABEL: or_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: 
vorr q0, q0, q1 +; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: @@ -627,20 +505,14 @@ ; CHECK-LABEL: or_v16i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: @@ -651,20 +523,14 @@ define arm_aapcs_vfpcc i8 @or_v8i8(<8 x i8> %x) { ; CHECK-LABEL: or_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: @@ 
-675,36 +541,16 @@ define arm_aapcs_vfpcc i8 @or_v16i8(<16 x i8> %x) { ; CHECK-LABEL: or_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmov.u8 r0, q0[12] ; CHECK-NEXT: vmov.u8 r1, q0[8] ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[14] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: @@ -716,36 +562,16 @@ ; CHECK-LABEL: or_v32i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; 
CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmov.u8 r0, q0[12] ; CHECK-NEXT: vmov.u8 r1, q0[8] ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[14] -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: @@ -810,12 +636,12 @@ define arm_aapcs_vfpcc i32 @or_v4i32_acc(<4 x i32> %x, i32 %y) { ; CHECK-LABEL: or_v4i32_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: orrs r2, r3 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr @@ -829,12 +655,12 @@ ; CHECK-LABEL: or_v8i32_acc: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: orrs r2, r3 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr @@ -847,12 +673,12 @@ define arm_aapcs_vfpcc i16 @or_v4i16_acc(<4 x i16> %x, i16 %y) { ; CHECK-LABEL: or_v4i16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: orrs 
r2, r3 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr @@ -865,20 +691,14 @@ define arm_aapcs_vfpcc i16 @or_v8i16_acc(<8 x i16> %x, i16 %y) { ; CHECK-LABEL: or_v8i16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: orrs r2, r3 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr @@ -892,20 +712,14 @@ ; CHECK-LABEL: or_v16i16_acc: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: orrs r2, r3 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr @@ -918,20 +732,14 @@ define arm_aapcs_vfpcc i8 @or_v8i8_acc(<8 x i8> %x, i8 %y) { ; CHECK-LABEL: or_v8i8_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; 
CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: orrs r2, r3 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr @@ -944,36 +752,16 @@ define arm_aapcs_vfpcc i8 @or_v16i8_acc(<16 x i8> %x, i8 %y) { ; CHECK-LABEL: or_v16i8_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[3] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[5] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[7] -; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmov.u8 r1, q0[12] ; CHECK-NEXT: vmov.u8 r2, q0[8] ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[13] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: vmov.u8 r3, q0[0] +; CHECK-NEXT: orrs r2, r3 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr @@ -987,36 +775,16 @@ ; CHECK-LABEL: or_v32i8_acc: ; CHECK: @ 
%bb.0: @ %entry ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[3] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[5] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[7] -; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; CHECK-NEXT: vorr q0, q0, q1 +; CHECK-NEXT: vmov.u8 r1, q0[12] ; CHECK-NEXT: vmov.u8 r2, q0[8] ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[13] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: vmov.u8 r3, q0[0] +; CHECK-NEXT: orrs r2, r3 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr @@ -1090,12 +858,12 @@ define arm_aapcs_vfpcc i32 @xor_v4i32(<4 x i32> %x) { ; CHECK-LABEL: xor_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: @@ -1107,12 +875,12 @@ ; CHECK-LABEL: xor_v8i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: veor q0, q0, q1 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 ; 
CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: @@ -1123,12 +891,12 @@ define arm_aapcs_vfpcc i16 @xor_v4i16(<4 x i16> %x) { ; CHECK-LABEL: xor_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: @@ -1139,20 +907,14 @@ define arm_aapcs_vfpcc i16 @xor_v8i16(<8 x i16> %x) { ; CHECK-LABEL: xor_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: @@ -1164,20 +926,14 @@ ; CHECK-LABEL: xor_v16i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: veor q0, q0, q1 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; 
CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: @@ -1188,20 +944,14 @@ define arm_aapcs_vfpcc i8 @xor_v8i8(<8 x i8> %x) { ; CHECK-LABEL: xor_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: @@ -1212,36 +962,16 @@ define arm_aapcs_vfpcc i8 @xor_v16i8(<16 x i8> %x) { ; CHECK-LABEL: xor_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vmov.u8 r0, q0[12] ; CHECK-NEXT: vmov.u8 r1, q0[8] ; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: eors r0, r1 -; 
CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[14] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: @@ -1253,36 +983,16 @@ ; CHECK-LABEL: xor_v32i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: veor q0, q0, q1 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vmov.u8 r0, q0[12] ; CHECK-NEXT: vmov.u8 r1, q0[8] ; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[14] -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: @@ -1347,12 +1057,12 @@ define arm_aapcs_vfpcc i32 @xor_v4i32_acc(<4 x i32> %x, i32 %y) { ; CHECK-LABEL: xor_v4i32_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; 
CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: eors r2, r3 ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr @@ -1366,12 +1076,12 @@ ; CHECK-LABEL: xor_v8i32_acc: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: veor q0, q0, q1 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: eors r2, r3 ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr @@ -1384,12 +1094,12 @@ define arm_aapcs_vfpcc i16 @xor_v4i16_acc(<4 x i16> %x, i16 %y) { ; CHECK-LABEL: xor_v4i16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: eors r2, r3 ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr @@ -1402,20 +1112,14 @@ define arm_aapcs_vfpcc i16 @xor_v8i16_acc(<8 x i16> %x, i16 %y) { ; CHECK-LABEL: xor_v8i16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: eors r2, r3 ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr @@ -1429,20 +1133,14 @@ ; CHECK-LABEL: 
xor_v16i16_acc: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: veor q0, q0, q1 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: eors r2, r3 ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr @@ -1455,20 +1153,14 @@ define arm_aapcs_vfpcc i8 @xor_v8i8_acc(<8 x i8> %x, i8 %y) { ; CHECK-LABEL: xor_v8i8_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: eors r2, r3 ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr @@ -1481,36 +1173,16 @@ define arm_aapcs_vfpcc i8 @xor_v16i8_acc(<16 x i8> %x, i8 %y) { ; CHECK-LABEL: xor_v16i8_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[3] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[4] 
-; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[5] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[7] -; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vmov.u8 r1, q0[12] ; CHECK-NEXT: vmov.u8 r2, q0[8] ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[13] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: vmov.u8 r3, q0[0] +; CHECK-NEXT: eors r2, r3 ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr @@ -1524,36 +1196,16 @@ ; CHECK-LABEL: xor_v32i8_acc: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: veor q0, q0, q1 -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[3] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[5] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[7] -; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: vmov.u8 r1, q0[12] ; CHECK-NEXT: vmov.u8 r2, q0[8] ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: 
eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[13] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: vmov.u8 r3, q0[0] +; CHECK-NEXT: eors r2, r3 ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll @@ -3,31 +3,51 @@ ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP define arm_aapcs_vfpcc float @fadd_v2f32(<2 x float> %x, float %y) { -; CHECK-LABEL: fadd_v2f32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: vldr s2, .LCPI0_0 -; CHECK-NEXT: vadd.f32 s0, s0, s2 -; CHECK-NEXT: vadd.f32 s0, s0, s2 -; CHECK-NEXT: vadd.f32 s0, s4, s0 -; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 0x00000000 @ float 0 +; CHECK-FP-LABEL: fadd_v2f32: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vadd.f32 s0, s0, s1 +; CHECK-FP-NEXT: vldr s2, .LCPI0_0 +; CHECK-FP-NEXT: vadd.f32 s0, s0, s2 +; CHECK-FP-NEXT: vadd.f32 s0, s4, s0 +; CHECK-FP-NEXT: bx lr +; CHECK-FP-NEXT: .p2align 2 +; CHECK-FP-NEXT: @ %bb.1: +; CHECK-FP-NEXT: .LCPI0_0: +; CHECK-FP-NEXT: .long 0x00000000 @ float 0 +; +; CHECK-NOFP-LABEL: fadd_v2f32: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vldr s2, .LCPI0_0 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vadd.f32 s0, s4, s0 +; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 2 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI0_0: +; CHECK-NOFP-NEXT: .long 0x00000000 @ float 0 entry: %z = call fast float 
@llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %y, <2 x float> %x) ret float %z } define arm_aapcs_vfpcc float @fadd_v4f32(<4 x float> %x, float %y) { -; CHECK-LABEL: fadd_v4f32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f32 s6, s0, s1 -; CHECK-NEXT: vadd.f32 s6, s6, s2 -; CHECK-NEXT: vadd.f32 s0, s6, s3 -; CHECK-NEXT: vadd.f32 s0, s4, s0 -; CHECK-NEXT: bx lr +; CHECK-FP-LABEL: fadd_v4f32: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vadd.f32 s6, s2, s3 +; CHECK-FP-NEXT: vadd.f32 s0, s0, s1 +; CHECK-FP-NEXT: vadd.f32 s0, s0, s6 +; CHECK-FP-NEXT: vadd.f32 s0, s4, s0 +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fadd_v4f32: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vadd.f32 s6, s0, s1 +; CHECK-NOFP-NEXT: vadd.f32 s6, s6, s2 +; CHECK-NOFP-NEXT: vadd.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vadd.f32 s0, s4, s0 +; CHECK-NOFP-NEXT: bx lr entry: %z = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %y, <4 x float> %x) ret float %z @@ -37,9 +57,9 @@ ; CHECK-FP-LABEL: fadd_v8f32: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vadd.f32 q0, q0, q1 -; CHECK-FP-NEXT: vadd.f32 s4, s0, s1 -; CHECK-FP-NEXT: vadd.f32 s4, s4, s2 -; CHECK-FP-NEXT: vadd.f32 s0, s4, s3 +; CHECK-FP-NEXT: vadd.f32 s4, s2, s3 +; CHECK-FP-NEXT: vadd.f32 s0, s0, s1 +; CHECK-FP-NEXT: vadd.f32 s0, s0, s4 ; CHECK-FP-NEXT: vadd.f32 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; @@ -59,27 +79,75 @@ ret float %z } +define arm_aapcs_vfpcc void @fadd_v2f16(<2 x half> %x, half* %yy) { +; CHECK-FP-LABEL: fadd_v2f16: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vmovx.f16 s4, s0 +; CHECK-FP-NEXT: vadd.f16 s0, s0, s4 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vadd.f16 s0, s2, s0 +; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fadd_v2f16: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI3_0 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; 
CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vadd.f16 s0, s2, s0 +; CHECK-NOFP-NEXT: vstr.16 s0, [r0] +; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 1 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI3_0: +; CHECK-NOFP-NEXT: .short 0x0000 @ half 0 +entry: + %y = load half, half* %yy + %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v2f16(half %y, <2 x half> %x) + store half %z, half* %yy + ret void +} + define arm_aapcs_vfpcc void @fadd_v4f16(<4 x half> %x, half* %yy) { -; CHECK-LABEL: fadd_v4f16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vadd.f16 s4, s0, s4 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vadd.f16 s4, s4, s1 -; CHECK-NEXT: vldr.16 s2, .LCPI3_0 -; CHECK-NEXT: vadd.f16 s0, s4, s0 -; CHECK-NEXT: vadd.f16 s0, s0, s2 -; CHECK-NEXT: vadd.f16 s0, s0, s2 -; CHECK-NEXT: vadd.f16 s0, s0, s2 -; CHECK-NEXT: vadd.f16 s0, s0, s2 -; CHECK-NEXT: vldr.16 s2, [r0] -; CHECK-NEXT: vadd.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] -; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 1 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI3_0: -; CHECK-NEXT: .short 0x0000 @ half 0 +; CHECK-FP-LABEL: fadd_v4f16: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vadd.f16 s0, s0, s6 +; CHECK-FP-NEXT: vadd.f16 s4, s1, s4 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vadd.f16 s0, s0, s4 +; CHECK-FP-NEXT: vadd.f16 s0, s2, s0 +; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fadd_v4f16: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vadd.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vadd.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI4_0 +; 
CHECK-NOFP-NEXT: vadd.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vadd.f16 s0, s2, s0 +; CHECK-NOFP-NEXT: vstr.16 s0, [r0] +; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 1 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI4_0: +; CHECK-NOFP-NEXT: .short 0x0000 @ half 0 entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half %y, <4 x half> %x) @@ -88,23 +156,35 @@ } define arm_aapcs_vfpcc void @fadd_v8f16(<8 x half> %x, half* %yy) { -; CHECK-LABEL: fadd_v8f16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vadd.f16 s4, s0, s4 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vadd.f16 s4, s4, s1 -; CHECK-NEXT: vadd.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s2 -; CHECK-NEXT: vadd.f16 s4, s4, s2 -; CHECK-NEXT: vldr.16 s2, [r0] -; CHECK-NEXT: vadd.f16 s4, s4, s6 -; CHECK-NEXT: vadd.f16 s4, s4, s3 -; CHECK-NEXT: vadd.f16 s0, s4, s0 -; CHECK-NEXT: vadd.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] -; CHECK-NEXT: bx lr +; CHECK-FP-LABEL: fadd_v8f16: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vrev32.16 q1, q0 +; CHECK-FP-NEXT: vadd.f16 q0, q0, q1 +; CHECK-FP-NEXT: vadd.f16 s4, s2, s3 +; CHECK-FP-NEXT: vadd.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vadd.f16 s0, s0, s4 +; CHECK-FP-NEXT: vadd.f16 s0, s2, s0 +; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fadd_v8f16: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vadd.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vadd.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vadd.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vadd.f16 s4, s4, s2 +; 
CHECK-NOFP-NEXT: vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vadd.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vadd.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vadd.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vadd.f16 s0, s2, s0 +; CHECK-NOFP-NEXT: vstr.16 s0, [r0] +; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half %y, <8 x half> %x) @@ -116,18 +196,12 @@ ; CHECK-FP-LABEL: fadd_v16f16: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vadd.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmovx.f16 s4, s0 -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vadd.f16 s4, s0, s4 -; CHECK-FP-NEXT: vmovx.f16 s0, s3 -; CHECK-FP-NEXT: vadd.f16 s4, s4, s1 -; CHECK-FP-NEXT: vadd.f16 s4, s4, s6 -; CHECK-FP-NEXT: vmovx.f16 s6, s2 -; CHECK-FP-NEXT: vadd.f16 s4, s4, s2 +; CHECK-FP-NEXT: vrev32.16 q1, q0 +; CHECK-FP-NEXT: vadd.f16 q0, q0, q1 +; CHECK-FP-NEXT: vadd.f16 s4, s2, s3 +; CHECK-FP-NEXT: vadd.f16 s0, s0, s1 ; CHECK-FP-NEXT: vldr.16 s2, [r0] -; CHECK-FP-NEXT: vadd.f16 s4, s4, s6 -; CHECK-FP-NEXT: vadd.f16 s4, s4, s3 -; CHECK-FP-NEXT: vadd.f16 s0, s4, s0 +; CHECK-FP-NEXT: vadd.f16 s0, s0, s4 ; CHECK-FP-NEXT: vadd.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr @@ -365,5 +439,6 @@ declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half, <16 x half>) +declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v2f16(half, <2 x half>) declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>) declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half, <8 x half>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll @@ -4,29 +4,47 @@ ; FIXME 
minnum nonan X, +Inf -> X ? define arm_aapcs_vfpcc float @fmin_v2f32(<2 x float> %x) { -; CHECK-LABEL: fmin_v2f32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldr s4, .LCPI0_0 -; CHECK-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-NEXT: vminnm.f32 s0, s0, s4 -; CHECK-NEXT: vminnm.f32 s0, s0, s4 -; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 0x7f800000 @ float +Inf +; CHECK-FP-LABEL: fmin_v2f32: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vldr s4, .LCPI0_0 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: bx lr +; CHECK-FP-NEXT: .p2align 2 +; CHECK-FP-NEXT: @ %bb.1: +; CHECK-FP-NEXT: .LCPI0_0: +; CHECK-FP-NEXT: .long 0x7f800000 @ float +Inf +; +; CHECK-NOFP-LABEL: fmin_v2f32: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vldr s4, .LCPI0_0 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 2 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI0_0: +; CHECK-NOFP-NEXT: .long 0x7f800000 @ float +Inf entry: %z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) ret float %z } define arm_aapcs_vfpcc float @fmin_v4f32(<4 x float> %x) { -; CHECK-LABEL: fmin_v4f32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vminnm.f32 s4, s0, s1 -; CHECK-NEXT: vminnm.f32 s4, s4, s2 -; CHECK-NEXT: vminnm.f32 s0, s4, s3 -; CHECK-NEXT: bx lr +; CHECK-FP-LABEL: fmin_v4f32: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmin_v4f32: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vminnm.f32 s4, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s4, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s3 +; CHECK-NOFP-NEXT: bx lr entry: %z = call fast float 
@llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x) ret float %z @@ -37,9 +55,9 @@ ; CHECK-FP-LABEL: fmin_v8f32: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f32 s4, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s4, s4, s2 -; CHECK-FP-NEXT: vminnm.f32 s0, s4, s3 +; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f32: @@ -66,43 +84,61 @@ } define arm_aapcs_vfpcc half @fmin_v4f16(<4 x half> %x) { -; CHECK-LABEL: fmin_v4f16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vminnm.f16 s4, s0, s4 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NEXT: vldr.16 s2, .LCPI3_0 -; CHECK-NEXT: vminnm.f16 s0, s4, s0 -; CHECK-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 1 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI3_0: -; CHECK-NEXT: .short 0x7c00 @ half +Inf +; CHECK-FP-LABEL: fmin_v4f16: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmin_v4f16: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI3_0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 1 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI3_0: +; 
CHECK-NOFP-NEXT: .short 0x7c00 @ half +Inf entry: %z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) ret half %z } define arm_aapcs_vfpcc half @fmin_v8f16(<8 x half> %x) { -; CHECK-LABEL: fmin_v8f16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vminnm.f16 s4, s0, s4 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NEXT: vminnm.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s2 -; CHECK-NEXT: vminnm.f16 s4, s4, s2 -; CHECK-NEXT: vminnm.f16 s4, s4, s6 -; CHECK-NEXT: vminnm.f16 s4, s4, s3 -; CHECK-NEXT: vminnm.f16 s0, s4, s0 -; CHECK-NEXT: bx lr +; CHECK-FP-LABEL: fmin_v8f16: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vrev32.16 q1, q0 +; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmin_v8f16: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x) ret half %z @@ -112,17 +148,11 @@ ; CHECK-FP-LABEL: fmin_v16f16: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmovx.f16 s4, s0 -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vminnm.f16 s4, s0, s4 -; CHECK-FP-NEXT: vmovx.f16 s0, s3 -; CHECK-FP-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-FP-NEXT: vminnm.f16 s4, s4, s6 -; CHECK-FP-NEXT: vmovx.f16 s6, s2 -; CHECK-FP-NEXT: vminnm.f16 s4, s4, 
s2 -; CHECK-FP-NEXT: vminnm.f16 s4, s4, s6 -; CHECK-FP-NEXT: vminnm.f16 s4, s4, s3 -; CHECK-FP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-FP-NEXT: vrev32.16 q1, q0 +; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v16f16: @@ -502,18 +532,30 @@ } define arm_aapcs_vfpcc float @fmin_v2f32_acc(<2 x float> %x, float %y) { -; CHECK-LABEL: fmin_v2f32_acc: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldr s6, .LCPI18_0 -; CHECK-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-NEXT: vminnm.f32 s0, s0, s6 -; CHECK-NEXT: vminnm.f32 s0, s0, s6 -; CHECK-NEXT: vminnm.f32 s0, s4, s0 -; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI18_0: -; CHECK-NEXT: .long 0x7f800000 @ float +Inf +; CHECK-FP-LABEL: fmin_v2f32_acc: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vldr s6, .LCPI18_0 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f32 s0, s4, s0 +; CHECK-FP-NEXT: bx lr +; CHECK-FP-NEXT: .p2align 2 +; CHECK-FP-NEXT: @ %bb.1: +; CHECK-FP-NEXT: .LCPI18_0: +; CHECK-FP-NEXT: .long 0x7f800000 @ float +Inf +; +; CHECK-NOFP-LABEL: fmin_v2f32_acc: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vldr s6, .LCPI18_0 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s0 +; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 2 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI18_0: +; CHECK-NOFP-NEXT: .long 0x7f800000 @ float +Inf entry: %z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) %c = fcmp fast olt float %y, %z @@ -522,13 +564,21 @@ } define arm_aapcs_vfpcc float @fmin_v4f32_acc(<4 x float> %x, float %y) { -; CHECK-LABEL: fmin_v4f32_acc: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vminnm.f32 s6, s0, s1 
-; CHECK-NEXT: vminnm.f32 s6, s6, s2 -; CHECK-NEXT: vminnm.f32 s0, s6, s3 -; CHECK-NEXT: vminnm.f32 s0, s4, s0 -; CHECK-NEXT: bx lr +; CHECK-FP-LABEL: fmin_v4f32_acc: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vminnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f32 s0, s4, s0 +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmin_v4f32_acc: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vminnm.f32 s6, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s6, s6, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s0 +; CHECK-NOFP-NEXT: bx lr entry: %z = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x) %c = fcmp fast olt float %y, %z @@ -540,9 +590,9 @@ ; CHECK-FP-LABEL: fmin_v8f32_acc: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f32 s4, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s4, s4, s2 -; CHECK-FP-NEXT: vminnm.f32 s0, s4, s3 +; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: vminnm.f32 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; @@ -573,26 +623,38 @@ } define arm_aapcs_vfpcc void @fmin_v4f16_acc(<4 x half> %x, half* %yy) { -; CHECK-LABEL: fmin_v4f16_acc: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vminnm.f16 s4, s0, s4 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NEXT: vldr.16 s2, .LCPI21_0 -; CHECK-NEXT: vminnm.f16 s0, s4, s0 -; CHECK-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NEXT: vminnm.f16 s0, s0, s2 -; CHECK-NEXT: vldr.16 s2, [r0] -; CHECK-NEXT: vminnm.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] -; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 1 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI21_0: -; CHECK-NEXT: .short 0x7c00 @ half +Inf +; CHECK-FP-LABEL: fmin_v4f16_acc: +; CHECK-FP: 
@ %bb.0: @ %entry +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s2, s0 +; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmin_v4f16_acc: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI21_0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vminnm.f16 s0, s2, s0 +; CHECK-NOFP-NEXT: vstr.16 s0, [r0] +; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 1 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI21_0: +; CHECK-NOFP-NEXT: .short 0x7c00 @ half +Inf entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) @@ -602,24 +664,74 @@ ret void } +define arm_aapcs_vfpcc void @fmin_v2f16_acc(<2 x half> %x, half* %yy) { +; CHECK-FP-LABEL: fmin_v2f16_acc: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vmovx.f16 s4, s0 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vminnm.f16 s0, s2, s0 +; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmin_v2f16_acc: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI22_0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, 
s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vminnm.f16 s0, s2, s0 +; CHECK-NOFP-NEXT: vstr.16 s0, [r0] +; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 1 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI22_0: +; CHECK-NOFP-NEXT: .short 0x7c00 @ half +Inf +entry: + %y = load half, half* %yy + %z = call fast half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half> %x) + %c = fcmp fast olt half %y, %z + %r = select i1 %c, half %y, half %z + store half %r, half* %yy + ret void +} + define arm_aapcs_vfpcc void @fmin_v8f16_acc(<8 x half> %x, half* %yy) { -; CHECK-LABEL: fmin_v8f16_acc: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vminnm.f16 s4, s0, s4 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NEXT: vminnm.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s2 -; CHECK-NEXT: vminnm.f16 s4, s4, s2 -; CHECK-NEXT: vldr.16 s2, [r0] -; CHECK-NEXT: vminnm.f16 s4, s4, s6 -; CHECK-NEXT: vminnm.f16 s4, s4, s3 -; CHECK-NEXT: vminnm.f16 s0, s4, s0 -; CHECK-NEXT: vminnm.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] -; CHECK-NEXT: bx lr +; CHECK-FP-LABEL: fmin_v8f16_acc: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vrev32.16 q1, q0 +; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s2, s0 +; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmin_v8f16_acc: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2 +; CHECK-NOFP-NEXT: 
vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s2, s0 +; CHECK-NOFP-NEXT: vstr.16 s0, [r0] +; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x) @@ -633,18 +745,12 @@ ; CHECK-FP-LABEL: fmin_v16f16_acc: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmovx.f16 s4, s0 -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vminnm.f16 s4, s0, s4 -; CHECK-FP-NEXT: vmovx.f16 s0, s3 -; CHECK-FP-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-FP-NEXT: vminnm.f16 s4, s4, s6 -; CHECK-FP-NEXT: vmovx.f16 s6, s2 -; CHECK-FP-NEXT: vminnm.f16 s4, s4, s2 +; CHECK-FP-NEXT: vrev32.16 q1, q0 +; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 ; CHECK-FP-NEXT: vldr.16 s2, [r0] -; CHECK-FP-NEXT: vminnm.f16 s4, s4, s6 -; CHECK-FP-NEXT: vminnm.f16 s4, s4, s3 -; CHECK-FP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: vminnm.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr @@ -1115,29 +1221,47 @@ } define arm_aapcs_vfpcc float @fmax_v2f32(<2 x float> %x) { -; CHECK-LABEL: fmax_v2f32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldr s4, .LCPI36_0 -; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 -; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 -; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI36_0: -; CHECK-NEXT: .long 0xff800000 @ float -Inf +; CHECK-FP-LABEL: fmax_v2f32: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vldr s4, .LCPI37_0 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: bx lr +; CHECK-FP-NEXT: .p2align 2 +; CHECK-FP-NEXT: @ %bb.1: +; CHECK-FP-NEXT: .LCPI37_0: +; CHECK-FP-NEXT: .long 0xff800000 @ float -Inf +; 
+; CHECK-NOFP-LABEL: fmax_v2f32: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vldr s4, .LCPI37_0 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 2 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI37_0: +; CHECK-NOFP-NEXT: .long 0xff800000 @ float -Inf entry: %z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) ret float %z } define arm_aapcs_vfpcc float @fmax_v4f32(<4 x float> %x) { -; CHECK-LABEL: fmax_v4f32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmaxnm.f32 s4, s0, s1 -; CHECK-NEXT: vmaxnm.f32 s4, s4, s2 -; CHECK-NEXT: vmaxnm.f32 s0, s4, s3 -; CHECK-NEXT: bx lr +; CHECK-FP-LABEL: fmax_v4f32: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmax_v4f32: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s3 +; CHECK-NOFP-NEXT: bx lr entry: %z = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x) ret float %z @@ -1147,9 +1271,9 @@ ; CHECK-FP-LABEL: fmax_v8f32: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f32 s4, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s4, s4, s2 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s4, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f32: @@ -1176,43 +1300,61 @@ } define arm_aapcs_vfpcc half @fmax_v4f16(<4 x half> %x) { -; CHECK-LABEL: fmax_v4f16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmaxnm.f16 s4, s0, s4 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NEXT: vldr.16 s2, 
.LCPI39_0 -; CHECK-NEXT: vmaxnm.f16 s0, s4, s0 -; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 1 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI39_0: -; CHECK-NEXT: .short 0xfc00 @ half -Inf +; CHECK-FP-LABEL: fmax_v4f16: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmax_v4f16: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI40_0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 1 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI40_0: +; CHECK-NOFP-NEXT: .short 0xfc00 @ half -Inf entry: %z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) ret half %z } define arm_aapcs_vfpcc half @fmax_v8f16(<8 x half> %x) { -; CHECK-LABEL: fmax_v8f16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vmaxnm.f16 s4, s0, s4 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s2 -; CHECK-NEXT: vmaxnm.f16 s4, s4, s2 -; CHECK-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-NEXT: vmaxnm.f16 s4, s4, s3 -; CHECK-NEXT: vmaxnm.f16 s0, s4, s0 -; CHECK-NEXT: bx lr +; CHECK-FP-LABEL: fmax_v8f16: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vrev32.16 q1, q0 +; CHECK-FP-NEXT: vmaxnm.f16 
q0, q0, q1 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmax_v8f16: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x) ret half %z @@ -1222,17 +1364,11 @@ ; CHECK-FP-LABEL: fmax_v16f16: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmovx.f16 s4, s0 -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s0, s4 -; CHECK-FP-NEXT: vmovx.f16 s0, s3 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-FP-NEXT: vmovx.f16 s6, s2 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s2 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s3 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-FP-NEXT: vrev32.16 q1, q0 +; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v16f16: @@ -1610,18 +1746,30 @@ } define arm_aapcs_vfpcc float @fmax_v2f32_acc(<2 x float> %x, float %y) { -; CHECK-LABEL: fmax_v2f32_acc: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldr s6, .LCPI54_0 -; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-NEXT: vmaxnm.f32 s0, s0, s6 -; CHECK-NEXT: vmaxnm.f32 s0, s0, s6 -; CHECK-NEXT: vmaxnm.f32 s0, s4, s0 -; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: 
-; CHECK-NEXT: .LCPI54_0: -; CHECK-NEXT: .long 0xff800000 @ float -Inf +; CHECK-FP-LABEL: fmax_v2f32_acc: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vldr s6, .LCPI55_0 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s4, s0 +; CHECK-FP-NEXT: bx lr +; CHECK-FP-NEXT: .p2align 2 +; CHECK-FP-NEXT: @ %bb.1: +; CHECK-FP-NEXT: .LCPI55_0: +; CHECK-FP-NEXT: .long 0xff800000 @ float -Inf +; +; CHECK-NOFP-LABEL: fmax_v2f32_acc: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vldr s6, .LCPI55_0 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s0 +; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 2 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI55_0: +; CHECK-NOFP-NEXT: .long 0xff800000 @ float -Inf entry: %z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) %c = fcmp fast ogt float %y, %z @@ -1630,13 +1778,21 @@ } define arm_aapcs_vfpcc float @fmax_v4f32_acc(<4 x float> %x, float %y) { -; CHECK-LABEL: fmax_v4f32_acc: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmaxnm.f32 s6, s0, s1 -; CHECK-NEXT: vmaxnm.f32 s6, s6, s2 -; CHECK-NEXT: vmaxnm.f32 s0, s6, s3 -; CHECK-NEXT: vmaxnm.f32 s0, s4, s0 -; CHECK-NEXT: bx lr +; CHECK-FP-LABEL: fmax_v4f32_acc: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vmaxnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s4, s0 +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmax_v4f32_acc: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s6, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s0 +; CHECK-NOFP-NEXT: bx lr entry: %z = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x) %c = fcmp fast ogt float %y, %z @@ -1648,9 
+1804,9 @@ ; CHECK-FP-LABEL: fmax_v8f32_acc: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f32 s4, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s4, s4, s2 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s4, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; @@ -1680,27 +1836,77 @@ ret float %r } +define arm_aapcs_vfpcc void @fmax_v2f16_acc(<2 x half> %x, half* %yy) { +; CHECK-FP-LABEL: fmax_v2f16_acc: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vmovx.f16 s4, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmax_v2f16_acc: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI58_0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NOFP-NEXT: vstr.16 s0, [r0] +; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 1 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI58_0: +; CHECK-NOFP-NEXT: .short 0xfc00 @ half -Inf +entry: + %y = load half, half* %yy + %z = call fast half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half> %x) + %c = fcmp fast ogt half %y, %z + %r = select i1 %c, half %y, half %z + store half %r, half* %yy + ret void +} + define arm_aapcs_vfpcc void @fmax_v4f16_acc(<4 x half> %x, half* %yy) { -; CHECK-LABEL: fmax_v4f16_acc: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmaxnm.f16 s4, s0, s4 -; CHECK-NEXT: vmovx.f16 s0, s1 -; 
CHECK-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NEXT: vldr.16 s2, .LCPI57_0 -; CHECK-NEXT: vmaxnm.f16 s0, s4, s0 -; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 -; CHECK-NEXT: vldr.16 s2, [r0] -; CHECK-NEXT: vmaxnm.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] -; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 1 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI57_0: -; CHECK-NEXT: .short 0xfc00 @ half -Inf +; CHECK-FP-LABEL: fmax_v4f16_acc: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmax_v4f16_acc: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI59_0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NOFP-NEXT: vstr.16 s0, [r0] +; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 1 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI59_0: +; CHECK-NOFP-NEXT: .short 0xfc00 @ half -Inf entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) @@ -1711,23 +1917,35 @@ } define arm_aapcs_vfpcc void @fmax_v8f16_acc(<8 x half> %x, half* %yy) { -; CHECK-LABEL: fmax_v8f16_acc: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: 
vmaxnm.f16 s4, s0, s4 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s2 -; CHECK-NEXT: vmaxnm.f16 s4, s4, s2 -; CHECK-NEXT: vldr.16 s2, [r0] -; CHECK-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-NEXT: vmaxnm.f16 s4, s4, s3 -; CHECK-NEXT: vmaxnm.f16 s0, s4, s0 -; CHECK-NEXT: vmaxnm.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] -; CHECK-NEXT: bx lr +; CHECK-FP-LABEL: fmax_v8f16_acc: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vrev32.16 q1, q0 +; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmax_v8f16_acc: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2 +; CHECK-NOFP-NEXT: vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NOFP-NEXT: vstr.16 s0, [r0] +; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x) @@ -1741,18 +1959,12 @@ ; CHECK-FP-LABEL: fmax_v16f16_acc: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmovx.f16 s4, s0 -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s0, s4 -; CHECK-FP-NEXT: vmovx.f16 s0, s3 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-FP-NEXT: vmovx.f16 s6, s2 -; CHECK-FP-NEXT: 
vmaxnm.f16 s4, s4, s2 +; CHECK-FP-NEXT: vrev32.16 q1, q0 +; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 ; CHECK-FP-NEXT: vldr.16 s2, [r0] -; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s4, s3 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr @@ -2235,8 +2447,10 @@ declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) declare half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half>) +declare half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half>) declare half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half>) declare half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half>) declare half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half>) +declare half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half>) declare half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half>) declare half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll @@ -14,13 +14,21 @@ } define arm_aapcs_vfpcc float @fmul_v4f32(<4 x float> %x, float %y) { -; CHECK-LABEL: fmul_v4f32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f32 s6, s0, s1 -; CHECK-NEXT: vmul.f32 s6, s6, s2 -; CHECK-NEXT: vmul.f32 s0, s6, s3 -; CHECK-NEXT: vmul.f32 s0, s4, s0 -; CHECK-NEXT: bx lr +; CHECK-FP-LABEL: fmul_v4f32: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vmul.f32 s6, s2, s3 +; CHECK-FP-NEXT: vmul.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmul.f32 s0, s0, s6 +; CHECK-FP-NEXT: vmul.f32 s0, s4, s0 +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmul_v4f32: +; 
CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmul.f32 s6, s0, s1 +; CHECK-NOFP-NEXT: vmul.f32 s6, s6, s2 +; CHECK-NOFP-NEXT: vmul.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vmul.f32 s0, s4, s0 +; CHECK-NOFP-NEXT: bx lr entry: %z = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %y, <4 x float> %x) ret float %z @@ -30,9 +38,9 @@ ; CHECK-FP-LABEL: fmul_v8f32: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmul.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmul.f32 s4, s0, s1 -; CHECK-FP-NEXT: vmul.f32 s4, s4, s2 -; CHECK-FP-NEXT: vmul.f32 s0, s4, s3 +; CHECK-FP-NEXT: vmul.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmul.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmul.f32 s0, s0, s4 ; CHECK-FP-NEXT: vmul.f32 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; @@ -52,43 +60,83 @@ ret float %z } -define arm_aapcs_vfpcc void @fmul_v4f16(<4 x half> %x, half* %yy) { -; CHECK-LABEL: fmul_v4f16: +define arm_aapcs_vfpcc void @fmul_v2f16(<2 x half> %x, half* %yy) { +; CHECK-LABEL: fmul_v2f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmul.f16 s4, s0, s4 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmul.f16 s4, s4, s1 +; CHECK-NEXT: vmul.f16 s0, s0, s4 ; CHECK-NEXT: vldr.16 s2, [r0] -; CHECK-NEXT: vmul.f16 s0, s4, s0 ; CHECK-NEXT: vmul.f16 s0, s2, s0 ; CHECK-NEXT: vstr.16 s0, [r0] ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy + %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half %y, <2 x half> %x) + store half %z, half* %yy + ret void +} + +define arm_aapcs_vfpcc void @fmul_v4f16(<4 x half> %x, half* %yy) { +; CHECK-FP-LABEL: fmul_v4f16: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vmul.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmul.f16 s4, s1, s4 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vmul.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmul.f16 s0, s2, s0 +; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmul_v4f16: +; CHECK-NOFP: @ %bb.0: @ 
%entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmul.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vmul.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vmul.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmul.f16 s0, s2, s0 +; CHECK-NOFP-NEXT: vstr.16 s0, [r0] +; CHECK-NOFP-NEXT: bx lr +entry: + %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half %y, <4 x half> %x) store half %z, half* %yy ret void } define arm_aapcs_vfpcc void @fmul_v8f16(<8 x half> %x, half* %yy) { -; CHECK-LABEL: fmul_v8f16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vmul.f16 s4, s0, s4 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmul.f16 s4, s4, s1 -; CHECK-NEXT: vmul.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s2 -; CHECK-NEXT: vmul.f16 s4, s4, s2 -; CHECK-NEXT: vldr.16 s2, [r0] -; CHECK-NEXT: vmul.f16 s4, s4, s6 -; CHECK-NEXT: vmul.f16 s4, s4, s3 -; CHECK-NEXT: vmul.f16 s0, s4, s0 -; CHECK-NEXT: vmul.f16 s0, s2, s0 -; CHECK-NEXT: vstr.16 s0, [r0] -; CHECK-NEXT: bx lr +; CHECK-FP-LABEL: fmul_v8f16: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: vrev32.16 q1, q0 +; CHECK-FP-NEXT: vmul.f16 q0, q0, q1 +; CHECK-FP-NEXT: vmul.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmul.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vmul.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmul.f16 s0, s2, s0 +; CHECK-FP-NEXT: vstr.16 s0, [r0] +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmul_v8f16: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vmul.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vmul.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vmul.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vmul.f16 s4, s4, s2 +; CHECK-NOFP-NEXT: vldr.16 s2, [r0] +; CHECK-NOFP-NEXT: vmul.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmul.f16 s4, s4, s3 
+; CHECK-NOFP-NEXT: vmul.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmul.f16 s0, s2, s0 +; CHECK-NOFP-NEXT: vstr.16 s0, [r0] +; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v8f16(half %y, <8 x half> %x) @@ -100,18 +148,12 @@ ; CHECK-FP-LABEL: fmul_v16f16: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmul.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmovx.f16 s4, s0 -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vmul.f16 s4, s0, s4 -; CHECK-FP-NEXT: vmovx.f16 s0, s3 -; CHECK-FP-NEXT: vmul.f16 s4, s4, s1 -; CHECK-FP-NEXT: vmul.f16 s4, s4, s6 -; CHECK-FP-NEXT: vmovx.f16 s6, s2 -; CHECK-FP-NEXT: vmul.f16 s4, s4, s2 +; CHECK-FP-NEXT: vrev32.16 q1, q0 +; CHECK-FP-NEXT: vmul.f16 q0, q0, q1 +; CHECK-FP-NEXT: vmul.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmul.f16 s0, s0, s1 ; CHECK-FP-NEXT: vldr.16 s2, [r0] -; CHECK-FP-NEXT: vmul.f16 s4, s4, s6 -; CHECK-FP-NEXT: vmul.f16 s4, s4, s3 -; CHECK-FP-NEXT: vmul.f16 s0, s4, s0 +; CHECK-FP-NEXT: vmul.f16 s0, s0, s4 ; CHECK-FP-NEXT: vmul.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr @@ -227,6 +269,22 @@ ret float %z } +define arm_aapcs_vfpcc void @fmul_v2f16_nofast(<2 x half> %x, half* %yy) { +; CHECK-LABEL: fmul_v2f16_nofast: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldr.16 s4, [r0] +; CHECK-NEXT: vmul.f16 s4, s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmul.f16 s0, s4, s0 +; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: bx lr +entry: + %y = load half, half* %yy + %z = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half %y, <2 x half> %x) + store half %z, half* %yy + ret void +} + define arm_aapcs_vfpcc void @fmul_v4f16_nofast(<4 x half> %x, half* %yy) { ; CHECK-LABEL: fmul_v4f16_nofast: ; CHECK: @ %bb.0: @ %entry @@ -349,5 +407,6 @@ declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>) declare half 
@llvm.experimental.vector.reduce.v2.fmul.f16.v16f16(half, <16 x half>) +declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half, <2 x half>) declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half, <4 x half>) declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v8f16(half, <8 x half>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -122,14 +122,14 @@ ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: le lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, s3 ; CHECK-NEXT: cmp r12, r1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: mul r2, r3, r2 ; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: mul lr, r3, r2 +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: mul r2, r3, r2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: mul r2, r3, r2 +; CHECK-NEXT: mul r2, r2, lr ; CHECK-NEXT: beq .LBB1_8 ; CHECK-NEXT: .LBB1_6: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r12 @@ -222,13 +222,13 @@ ; CHECK-NEXT: vand q0, q1, q0 ; CHECK-NEXT: le lr, .LBB2_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov r12, s1 +; CHECK-NEXT: vmov r12, s3 ; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: and.w r12, r12, r2 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov lr, s1 ; CHECK-NEXT: and.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: and.w r2, r2, lr ; CHECK-NEXT: and.w r2, r2, r12 ; CHECK-NEXT: beq .LBB2_9 ; CHECK-NEXT: .LBB2_7: @ %for.body.preheader1 @@ -322,13 +322,13 @@ ; CHECK-NEXT: vorr q0, q1, q0 ; CHECK-NEXT: le lr, .LBB3_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov r12, s1 +; CHECK-NEXT: vmov r12, s3 ; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: orr.w r12, r12, r2 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov lr, s1 ; CHECK-NEXT: 
orr.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: orr.w r2, r2, lr ; CHECK-NEXT: orr.w r2, r2, r12 ; CHECK-NEXT: beq .LBB3_9 ; CHECK-NEXT: .LBB3_7: @ %for.body.preheader1 @@ -422,13 +422,13 @@ ; CHECK-NEXT: veor q0, q1, q0 ; CHECK-NEXT: le lr, .LBB4_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov r12, s1 +; CHECK-NEXT: vmov r12, s3 ; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: eor.w r12, r12, r2 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov lr, s1 ; CHECK-NEXT: eor.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: eor.w r2, r2, lr ; CHECK-NEXT: eor.w r2, r2, r12 ; CHECK-NEXT: beq .LBB4_9 ; CHECK-NEXT: .LBB4_7: @ %for.body.preheader1 @@ -522,10 +522,10 @@ ; CHECK-NEXT: vadd.f32 q0, q1, q0 ; CHECK-NEXT: le lr, .LBB5_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vadd.f32 s4, s0, s1 +; CHECK-NEXT: vadd.f32 s4, s2, s3 ; CHECK-NEXT: cmp r2, r1 -; CHECK-NEXT: vadd.f32 s4, s4, s2 -; CHECK-NEXT: vadd.f32 s0, s4, s3 +; CHECK-NEXT: vadd.f32 s0, s0, s1 +; CHECK-NEXT: vadd.f32 s0, s0, s4 ; CHECK-NEXT: beq .LBB5_9 ; CHECK-NEXT: .LBB5_7: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r2 @@ -623,10 +623,10 @@ ; CHECK-NEXT: vmul.f32 q0, q1, q0 ; CHECK-NEXT: le lr, .LBB6_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmul.f32 s4, s0, s1 +; CHECK-NEXT: vmul.f32 s4, s2, s3 ; CHECK-NEXT: cmp r2, r1 -; CHECK-NEXT: vmul.f32 s4, s4, s2 -; CHECK-NEXT: vmul.f32 s0, s4, s3 +; CHECK-NEXT: vmul.f32 s0, s0, s1 +; CHECK-NEXT: vmul.f32 s0, s0, s4 ; CHECK-NEXT: beq .LBB6_9 ; CHECK-NEXT: .LBB6_7: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r2 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll @@ -16,12 +16,12 @@ define arm_aapcs_vfpcc i32 @mul_v4i32(<4 x i32> %x) { ; CHECK-LABEL: mul_v4i32: ; CHECK: @ %bb.0: 
@ %entry -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: @@ -33,12 +33,12 @@ ; CHECK-LABEL: mul_v8i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmul.i32 q0, q0, q1 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: @@ -49,12 +49,12 @@ define arm_aapcs_vfpcc i16 @mul_v4i16(<4 x i16> %x) { ; CHECK-LABEL: mul_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: @@ -65,20 +65,14 @@ define arm_aapcs_vfpcc i16 @mul_v8i16(<8 x i16> %x) { ; CHECK-LABEL: mul_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vmul.i16 q0, q0, q1 +; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r2, 
q0[0] +; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: @@ -90,20 +84,14 @@ ; CHECK-LABEL: mul_v16i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmul.i16 q0, q0, q1 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vmul.i16 q0, q0, q1 +; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: @@ -114,20 +102,14 @@ define arm_aapcs_vfpcc i8 @mul_v8i8(<8 x i8> %x) { ; CHECK-LABEL: mul_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vmul.i16 q0, q0, q1 +; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: @@ -138,36 +120,16 @@ define arm_aapcs_vfpcc i8 @mul_v16i8(<16 x i8> %x) { ; CHECK-LABEL: mul_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: 
vmov.u8 r1, q0[2] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: vmul.i8 q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; CHECK-NEXT: vmul.i8 q0, q0, q1 +; CHECK-NEXT: vmov.u8 r0, q0[12] ; CHECK-NEXT: vmov.u8 r1, q0[8] ; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[14] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: @@ -179,36 +141,16 @@ ; CHECK-LABEL: mul_v32i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmul.i8 q0, q0, q1 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: vmul.i8 q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; CHECK-NEXT: vmul.i8 q0, q0, q1 +; CHECK-NEXT: vmov.u8 r0, q0[12] ; CHECK-NEXT: vmov.u8 r1, q0[8] ; CHECK-NEXT: muls r0, r1, r0 -; 
CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[14] -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: @@ -286,12 +228,12 @@ define arm_aapcs_vfpcc i32 @mul_v4i32_acc(<4 x i32> %x, i32 %y) { ; CHECK-LABEL: mul_v4i32_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr @@ -305,12 +247,12 @@ ; CHECK-LABEL: mul_v8i32_acc: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmul.i32 q0, q0, q1 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr @@ -323,12 +265,12 @@ define arm_aapcs_vfpcc i16 @mul_v4i16_acc(<4 x i16> %x, i16 %y) { ; CHECK-LABEL: mul_v4i16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: muls r1, 
r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr @@ -341,20 +283,14 @@ define arm_aapcs_vfpcc i16 @mul_v8i16_acc(<8 x i16> %x, i16 %y) { ; CHECK-LABEL: mul_v8i16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vmul.i16 q0, q0, q1 +; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr @@ -368,20 +304,14 @@ ; CHECK-LABEL: mul_v16i16_acc: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmul.i16 q0, q0, q1 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vmul.i16 q0, q0, q1 +; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr @@ -394,20 +324,14 @@ define arm_aapcs_vfpcc i8 @mul_v8i8_acc(<8 x i8> %x, i8 %y) { ; CHECK-LABEL: mul_v8i8_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; 
CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: vrev32.16 q1, q0 +; CHECK-NEXT: vmul.i16 q0, q0, q1 +; CHECK-NEXT: vmov.u16 r1, q0[6] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr @@ -420,36 +344,16 @@ define arm_aapcs_vfpcc i8 @mul_v16i8_acc(<16 x i8> %x, i8 %y) { ; CHECK-LABEL: mul_v16i8_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[3] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[5] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[7] -; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: vmul.i8 q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; CHECK-NEXT: vmul.i8 q0, q0, q1 +; CHECK-NEXT: vmov.u8 r1, q0[12] ; CHECK-NEXT: vmov.u8 r2, q0[8] ; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[13] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: vmov.u8 r3, 
q0[0] +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr @@ -463,36 +367,16 @@ ; CHECK-LABEL: mul_v32i8_acc: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmul.i8 q0, q0, q1 -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[3] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[5] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[7] -; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: vrev16.8 q1, q0 +; CHECK-NEXT: vmul.i8 q0, q0, q1 +; CHECK-NEXT: vrev32.8 q1, q0 +; CHECK-NEXT: vmul.i8 q0, q0, q1 +; CHECK-NEXT: vmov.u8 r1, q0[12] ; CHECK-NEXT: vmov.u8 r2, q0[8] ; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[13] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: vmov.u8 r3, q0[0] +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr