diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -206,6 +206,10 @@
     VMULLs,       // ...signed
     VMULLu,       // ...unsigned
 
+    // MVE reductions
+    VADDVs,
+    VADDVu,
+
     SMULWB,       // Signed multiply word by half word, bottom
     SMULWT,       // Signed multiply word by half word, top
     UMLAL,        // 64bit Unsigned Accumulate Multiply
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -942,6 +942,7 @@
     setTargetDAGCombine(ISD::ANY_EXTEND);
     setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
     setTargetDAGCombine(ISD::INTRINSIC_VOID);
+    setTargetDAGCombine(ISD::VECREDUCE_ADD);
   }
 
   if (!Subtarget->hasFP64()) {
@@ -1653,6 +1654,8 @@
   case ARMISD::VMOVN:   return "ARMISD::VMOVN";
   case ARMISD::VMULLs:  return "ARMISD::VMULLs";
   case ARMISD::VMULLu:  return "ARMISD::VMULLu";
+  case ARMISD::VADDVs:  return "ARMISD::VADDVs";
+  case ARMISD::VADDVu:  return "ARMISD::VADDVu";
   case ARMISD::UMAAL:   return "ARMISD::UMAAL";
   case ARMISD::UMLAL:   return "ARMISD::UMLAL";
   case ARMISD::SMLAL:   return "ARMISD::SMLAL";
@@ -13926,6 +13929,40 @@
                      ConvInput, DAG.getConstant(C, dl, MVT::i32));
 }
 
+static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
+                                           const ARMSubtarget *ST) {
+  if (!ST->hasMVEIntegerOps())
+    return SDValue();
+
+  assert(N->getOpcode() == ISD::VECREDUCE_ADD);
+  EVT ResVT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDLoc dl(N);
+
+  // We are looking for something that will have illegal types if left alone,
+  // but that we can convert to a single instruction under MVE. For example
+  // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
+
+  // Cases:
+  //   VADDV u/s 8/16/32
+
+  auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
+    if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
+      return SDValue();
+    SDValue A = N0->getOperand(0);
+    if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; }))
+      return A;
+    return SDValue();
+  };
+
+  if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
+    return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
+  if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
+    return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
+
+  return SDValue();
+}
+
 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
@@ -14818,6 +14855,8 @@
     return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget);
   case ARMISD::VCMP:
     return PerformVCMPCombine(N, DCI, Subtarget);
+  case ISD::VECREDUCE_ADD:
+    return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
   case ARMISD::SMULWB: {
     unsigned BitWidth = N->getValueType(0).getSizeInBits();
     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -620,17 +620,41 @@
 defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>;
 defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>;
 
+def ARMVADDVs : SDNode<"ARMISD::VADDVs", SDTVecReduce>;
+def ARMVADDVu : SDNode<"ARMISD::VADDVu", SDTVecReduce>;
+
 let Predicates = [HasMVEInt] in {
-  def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))), (i32 (MVE_VADDVu32no_acc $src))>;
-  def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))), (i32 (MVE_VADDVu16no_acc $src))>;
-  def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))), (i32 (MVE_VADDVu8no_acc $src))>;
-  def : Pat<(i32 (add (i32 (vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPR:$src2))),
+  def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))),
+            (i32 (MVE_VADDVu32no_acc $src))>;
+  def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))),
+            (i32 (MVE_VADDVu16no_acc $src))>;
+  def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))),
+            (i32 (MVE_VADDVu8no_acc $src))>;
+
+  def : Pat<(i32 (ARMVADDVs (v8i16 MQPR:$src))),
+            (i32 (MVE_VADDVs16no_acc $src))>;
+  def : Pat<(i32 (ARMVADDVu (v8i16 MQPR:$src))),
+            (i32 (MVE_VADDVu16no_acc $src))>;
+  def : Pat<(i32 (ARMVADDVs (v16i8 MQPR:$src))),
+            (i32 (MVE_VADDVs8no_acc $src))>;
+  def : Pat<(i32 (ARMVADDVu (v16i8 MQPR:$src))),
+            (i32 (MVE_VADDVu8no_acc $src))>;
+
+  def : Pat<(i32 (add (i32 (vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPREven:$src2))),
             (i32 (MVE_VADDVu32acc $src2, $src1))>;
-  def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPR:$src2))),
+  def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPREven:$src2))),
             (i32 (MVE_VADDVu16acc $src2, $src1))>;
-  def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPR:$src2))),
+  def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPREven:$src2))),
             (i32 (MVE_VADDVu8acc $src2, $src1))>;
+  def : Pat<(i32 (add (i32 (ARMVADDVs (v8i16 MQPR:$src1))), (i32 tGPREven:$src2))),
+            (i32 (MVE_VADDVs16acc $src2, $src1))>;
+  def : Pat<(i32 (add (i32 (ARMVADDVu (v8i16 MQPR:$src1))), (i32 tGPREven:$src2))),
+            (i32 (MVE_VADDVu16acc $src2, $src1))>;
+  def : Pat<(i32 (add (i32 (ARMVADDVs (v16i8 MQPR:$src1))), (i32 tGPREven:$src2))),
+            (i32 (MVE_VADDVs8acc $src2, $src1))>;
+  def : Pat<(i32 (add (i32 (ARMVADDVu (v16i8 MQPR:$src1))), (i32 tGPREven:$src2))),
+            (i32 (MVE_VADDVu8acc $src2, $src1))>;
 }
 
 class MVE_VADDLV<...
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
@@ ... @@
 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
 ; CHECK-LABEL: add_v8i16_v8i32_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmovlb.u16 q1, q1
-; CHECK-NEXT:    vmovlb.u16 q0, q2
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    vaddv.u16 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
@@ -160,26 +141,7 @@
 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
 ; CHECK-LABEL: add_v8i16_v8i32_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmovlb.s16 q0, q2
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    vaddv.s16 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
@@ -397,50 +359,7 @@
 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
 ; CHECK-LABEL: add_v16i8_v16i32_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
-; CHECK-NEXT:    vmov.i32 q1, #0xff
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
-; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q4[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q4[3], r0
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vand q0, q4, q1
-; CHECK-NEXT:    vadd.i32 q0, q0, q3
-; CHECK-NEXT:    vadd.i32 q0, q0, q2
-; CHECK-NEXT:    vaddv.u32 r0, q0
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vaddv.u8 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
@@ -451,50 +370,7 @@
 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
 ; CHECK-LABEL: add_v16i8_v16i32_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmovlb.s8 q1, q1
-; CHECK-NEXT:    vmovlb.s8 q2, q2
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmovlb.s8 q2, q2
-; CHECK-NEXT:    vmovlb.s8 q0, q3
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vadd.i32 q0, q0, q2
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    vaddv.s8 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
@@ -1114,26 +990,7 @@
 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
 ; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmovlb.u16 q1, q1
-; CHECK-NEXT:    vmovlb.u16 q0, q2
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    vaddva.u16 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
@@ -1145,26 +1002,7 @@
 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
 ; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmovlb.s16 q0, q2
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    vaddva.s16 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
@@ -1402,50 +1240,7 @@
 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
 ; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    vmov.i32 q1, #0xff
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    vmov.32 q3[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    vmov.32 q3[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    vmov.32 q3[3], r1
-; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    vmov.32 q3[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    vmov.32 q3[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    vmov.32 q3[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmov.32 q4[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.32 q4[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    vmov.32 q4[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    vmov.32 q4[3], r1
-; CHECK-NEXT:    vand q3, q3, q1
-; CHECK-NEXT:    vand q0, q4, q1
-; CHECK-NEXT:    vadd.i32 q0, q0, q3
-; CHECK-NEXT:    vadd.i32 q0, q0, q2
-; CHECK-NEXT:    vaddva.u32 r0, q0
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vaddva.u8 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
@@ -1457,50 +1252,7 @@
 define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
 ; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmovlb.s8 q1, q1
-; CHECK-NEXT:    vmovlb.s8 q2, q2
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.32 q3[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    vmov.32 q3[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    vmov.32 q3[3], r1
-; CHECK-NEXT:    vmovlb.s8 q2, q2
-; CHECK-NEXT:    vmovlb.s8 q0, q3
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vadd.i32 q0, q0, q2
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    vaddva.s8 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
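
For reference, the shape of IR that PerformVECREDUCE_ADDCombine targets is roughly the following minimal sketch. The function name is illustrative, and the truncated entry blocks above do not show the reduction call; it is spelled llvm.vector.reduce.add in current trees (llvm.experimental.vector.reduce.add in older ones).

declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)

define arm_aapcs_vfpcc i32 @example_vaddv_s16(<8 x i16> %x) {
entry:
  ; The sext makes the reduction operate on an illegal v8i32; the combine
  ; folds sext + vecreduce_add into a single ARMISD::VADDVs node instead.
  %ext = sext <8 x i16> %x to <8 x i32>
  %sum = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %ext)
  ret i32 %sum
}

Compiled with something like llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve, this should now select to a single vaddv.s16 r0, q0 followed by bx lr, matching the updated CHECK lines above, rather than the widening vmov/vmovlb sequences that were previously emitted.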