Index: llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -283,6 +283,10 @@ setOperationAction(ISD::USUBSAT, VT, Legal); setOperationAction(ISD::ABDS, VT, Legal); setOperationAction(ISD::ABDU, VT, Legal); + setOperationAction(ISD::HADDS, VT, Legal); + setOperationAction(ISD::HADDU, VT, Legal); + setOperationAction(ISD::RHADDS, VT, Legal); + setOperationAction(ISD::RHADDU, VT, Legal); // No native support for these. setOperationAction(ISD::UDIV, VT, Expand); @@ -14879,6 +14883,10 @@ case ISD::MULHU: case ISD::ABDS: case ISD::ABDU: + case ISD::HADDS: + case ISD::HADDU: + case ISD::RHADDS: + case ISD::RHADDU: break; default: return SDValue(); Index: llvm/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrMVE.td +++ llvm/lib/Target/ARM/ARMInstrMVE.td @@ -2189,36 +2189,29 @@ return N->getFlags().hasNoSignedWrap(); }]>; -multiclass MVE_VRHADD_m { +multiclass MVE_VRHADD_m { def "" : MVE_VRHADD_Base; defvar Inst = !cast(NAME); + defm : MVE_TwoOpPattern(NAME)>; let Predicates = [HasMVEInt] in { - // Unpredicated rounding add-with-divide-by-two + // Unpredicated rounding add-with-divide-by-two intrinsic def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))), (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; - - // Predicated add-with-divide-by-two - def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), - (VTI.Vec MQPR:$inactive))), - (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - ARMVCCThen, (VTI.Pred VCCR:$mask), - (VTI.Vec MQPR:$inactive)))>; } } -multiclass MVE_VRHADD - : MVE_VRHADD_m; +multiclass MVE_VRHADD + : MVE_VRHADD_m; -defm MVE_VRHADDs8 : MVE_VRHADD; -defm MVE_VRHADDs16 : MVE_VRHADD; -defm MVE_VRHADDs32 : MVE_VRHADD; -defm MVE_VRHADDu8 : MVE_VRHADD; -defm MVE_VRHADDu16 : MVE_VRHADD; -defm MVE_VRHADDu32 : MVE_VRHADD; +defm MVE_VRHADDs8 : MVE_VRHADD; +defm MVE_VRHADDs16 : MVE_VRHADD; +defm MVE_VRHADDs32 : MVE_VRHADD; +defm MVE_VRHADDu8 : MVE_VRHADD; +defm MVE_VRHADDu16 : MVE_VRHADD; +defm MVE_VRHADDu32 : MVE_VRHADD; // Rounding Halving Add perform the arithemtic operation with an extra bit of // precision, before performing the shift, to void clipping errors. We're not @@ -2275,11 +2268,12 @@ list pattern=[]> : MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>; -multiclass MVE_VHADD_m { def "" : MVE_VHADD_; defvar Inst = !cast(NAME); + defm : MVE_TwoOpPattern(NAME)>; let Predicates = [HasMVEInt] in { // Unpredicated add-and-divide-by-two @@ -2288,30 +2282,23 @@ def : Pat<(VTI.Vec (shift_op (add_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), (i32 1))), (Inst MQPR:$Qm, MQPR:$Qn)>; - - // Predicated add-and-divide-by-two - def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned), - (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), - (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - ARMVCCThen, (VTI.Pred VCCR:$mask), - (VTI.Vec MQPR:$inactive)))>; } } -multiclass MVE_VHADD - : MVE_VHADD_m + : MVE_VHADD_m; // Halving add/sub perform the arithemtic operation with an extra bit of // precision, before performing the shift, to void clipping errors. We're not // modelling that here with these patterns, but we're using no wrap forms of // add/sub to ensure that the extra bit of information is not needed. -defm MVE_VHADDs8 : MVE_VHADD; -defm MVE_VHADDs16 : MVE_VHADD; -defm MVE_VHADDs32 : MVE_VHADD; -defm MVE_VHADDu8 : MVE_VHADD; -defm MVE_VHADDu16 : MVE_VHADD; -defm MVE_VHADDu32 : MVE_VHADD; +defm MVE_VHADDs8 : MVE_VHADD; +defm MVE_VHADDs16 : MVE_VHADD; +defm MVE_VHADDs32 : MVE_VHADD; +defm MVE_VHADDu8 : MVE_VHADD; +defm MVE_VHADDu16 : MVE_VHADD; +defm MVE_VHADDu32 : MVE_VHADD; multiclass MVE_VHSUB_m { def "" : MVE_VxADDSUB_qr; + defm : MVE_TwoOpPatternDup(NAME)>; defm : MVE_vec_scalar_int_pat_m(NAME), VTI, unpred_int, pred_int, 1, 1>; } -multiclass MVE_VHADD_qr_m : - MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd, +multiclass MVE_VHADD_qr_m : + MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, Op, int_arm_mve_vhadd, int_arm_mve_hadd_predicated>; multiclass MVE_VHSUB_qr_m : - MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub, + MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, null_frag, int_arm_mve_vhsub, int_arm_mve_hsub_predicated>; -defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m; -defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m; -defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m; -defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m; -defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m; -defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m; defm MVE_VHSUB_qr_s8 : MVE_VHSUB_qr_m; defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m; Index: llvm/test/CodeGen/Thumb2/mve-vhadd.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vhadd.ll +++ llvm/test/CodeGen/Thumb2/mve-vhadd.ll @@ -4,18 +4,7 @@ define arm_aapcs_vfpcc <16 x i8> @vrhadd_s8(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-LABEL: vrhadd_s8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmovlt.s8 q2, q1 -; CHECK-NEXT: vmovlt.s8 q3, q0 -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vadd.i16 q2, q3, q2 -; CHECK-NEXT: vmov.i16 q3, #0x1 -; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vadd.i16 q2, q2, q3 -; CHECK-NEXT: vadd.i16 q0, q0, q3 -; CHECK-NEXT: vshr.u16 q2, q2, #1 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vrhadd.s8 q0, q0, q1 ; CHECK-NEXT: bx lr %sextsrc1 = sext <16 x i8> %src1 to <16 x i16> %sextsrc2 = sext <16 x i8> %src2 to <16 x i16> @@ -29,18 +18,7 @@ define arm_aapcs_vfpcc <8 x i16> @vrhadd_s16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: vrhadd_s16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmovlt.s16 q2, q1 -; CHECK-NEXT: vmovlt.s16 q3, q0 -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: vmov.i32 q3, #0x1 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: vadd.i32 q0, q0, q3 -; CHECK-NEXT: vshr.u32 q2, q2, #1 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: vrhadd.s16 q0, q0, q1 ; CHECK-NEXT: bx lr %sextsrc1 = sext <8 x i16> %src1 to <8 x i32> %sextsrc2 = sext <8 x i16> %src2 to <8 x i32> @@ -54,50 +32,7 @@ define arm_aapcs_vfpcc <4 x i32> @vrhadd_s32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vrhadd_s32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s16, s6 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r2, asr #31 -; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adc.w r3, r2, r3, asr #31 -; CHECK-NEXT: adds r2, r1, #1 -; CHECK-NEXT: adc r1, r3, #0 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r2, asr #31 -; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adc.w r3, r2, r3, asr #31 -; CHECK-NEXT: adds r2, r1, #1 -; CHECK-NEXT: adc r1, r3, #0 -; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vrhadd.s32 q0, q0, q1 ; CHECK-NEXT: bx lr %sextsrc1 = sext <4 x i32> %src1 to <4 x i64> %sextsrc2 = sext <4 x i32> %src2 to <4 x i64> @@ -111,15 +46,7 @@ define arm_aapcs_vfpcc <16 x i8> @vhadd_s8(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-LABEL: vhadd_s8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmovlt.s8 q2, q1 -; CHECK-NEXT: vmovlt.s8 q3, q0 -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vadd.i16 q2, q3, q2 -; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vshr.u16 q2, q2, #1 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vhadd.s8 q0, q0, q1 ; CHECK-NEXT: bx lr %sextsrc1 = sext <16 x i8> %src1 to <16 x i16> %sextsrc2 = sext <16 x i8> %src2 to <16 x i16> @@ -132,15 +59,7 @@ define arm_aapcs_vfpcc <8 x i16> @vhadd_s16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: vhadd_s16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmovlt.s16 q2, q1 -; CHECK-NEXT: vmovlt.s16 q3, q0 -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vshr.u32 q2, q2, #1 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: vhadd.s16 q0, q0, q1 ; CHECK-NEXT: bx lr %sextsrc1 = sext <8 x i16> %src1 to <8 x i32> %sextsrc2 = sext <8 x i16> %src2 to <8 x i32> @@ -153,42 +72,7 @@ define arm_aapcs_vfpcc <4 x i32> @vhadd_s32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vhadd_s32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s16, s6 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r2, asr #31 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: adds r2, r1, r3 -; CHECK-NEXT: asr.w r12, r1, #31 -; CHECK-NEXT: adc.w r1, r12, r3, asr #31 -; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r2, asr #31 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: adds r2, r1, r3 -; CHECK-NEXT: asr.w r12, r1, #31 -; CHECK-NEXT: adc.w r1, r12, r3, asr #31 -; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vhadd.s32 q0, q0, q1 ; CHECK-NEXT: bx lr %sextsrc1 = sext <4 x i32> %src1 to <4 x i64> %sextsrc2 = sext <4 x i32> %src2 to <4 x i64> @@ -201,18 +85,7 @@ define arm_aapcs_vfpcc <16 x i8> @vrhadd_u8(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-LABEL: vrhadd_u8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmovlt.u8 q2, q1 -; CHECK-NEXT: vmovlt.u8 q3, q0 -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vadd.i16 q2, q3, q2 -; CHECK-NEXT: vmov.i16 q3, #0x1 -; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vadd.i16 q2, q2, q3 -; CHECK-NEXT: vadd.i16 q0, q0, q3 -; CHECK-NEXT: vshr.u16 q2, q2, #1 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vrhadd.u8 q0, q0, q1 ; CHECK-NEXT: bx lr %zextsrc1 = zext <16 x i8> %src1 to <16 x i16> %zextsrc2 = zext <16 x i8> %src2 to <16 x i16> @@ -226,18 +99,7 @@ define arm_aapcs_vfpcc <8 x i16> @vrhadd_u16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: vrhadd_u16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmovlt.u16 q2, q1 -; CHECK-NEXT: vmovlt.u16 q3, q0 -; CHECK-NEXT: vmovlb.u16 q1, q1 -; CHECK-NEXT: vmovlb.u16 q0, q0 -; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: vmov.i32 q3, #0x1 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: vadd.i32 q0, q0, q3 -; CHECK-NEXT: vshr.u32 q2, q2, #1 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: vrhadd.u16 q0, q0, q1 ; CHECK-NEXT: bx lr %zextsrc1 = zext <8 x i16> %src1 to <8 x i32> %zextsrc2 = zext <8 x i16> %src2 to <8 x i32> @@ -251,50 +113,7 @@ define arm_aapcs_vfpcc <4 x i32> @vrhadd_u32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vrhadd_u32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.i64 q4, #0xffffffff -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vand q2, q2, q4 -; CHECK-NEXT: vand q3, q3, q4 -; CHECK-NEXT: vmov r0, r1, d4 -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vand q1, q1, q4 -; CHECK-NEXT: vand q4, q0, q4 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: vmov r3, r2, d8 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, r12, d2 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adc.w r3, r2, r12 -; CHECK-NEXT: adds r2, r1, #1 -; CHECK-NEXT: adc r1, r3, #0 -; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: vmov r3, r2, d9 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, r12, d3 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: adc.w r3, r2, r12 -; CHECK-NEXT: adds r2, r1, #1 -; CHECK-NEXT: adc r1, r3, #0 -; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vrhadd.u32 q0, q0, q1 ; CHECK-NEXT: bx lr %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> %zextsrc2 = zext <4 x i32> %src2 to <4 x i64> @@ -308,15 +127,7 @@ define arm_aapcs_vfpcc <16 x i8> @vhadd_u8(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-LABEL: vhadd_u8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmovlt.u8 q2, q1 -; CHECK-NEXT: vmovlt.u8 q3, q0 -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vadd.i16 q2, q3, q2 -; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vshr.u16 q2, q2, #1 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vhadd.u8 q0, q0, q1 ; CHECK-NEXT: bx lr %zextsrc1 = zext <16 x i8> %src1 to <16 x i16> %zextsrc2 = zext <16 x i8> %src2 to <16 x i16> @@ -329,15 +140,7 @@ define arm_aapcs_vfpcc <8 x i16> @vhadd_u16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: vhadd_u16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmovlt.u16 q2, q1 -; CHECK-NEXT: vmovlt.u16 q3, q0 -; CHECK-NEXT: vmovlb.u16 q1, q1 -; CHECK-NEXT: vmovlb.u16 q0, q0 -; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vshr.u32 q2, q2, #1 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: vhadd.u16 q0, q0, q1 ; CHECK-NEXT: bx lr %zextsrc1 = zext <8 x i16> %src1 to <8 x i32> %zextsrc2 = zext <8 x i16> %src2 to <8 x i32> @@ -350,45 +153,8 @@ define arm_aapcs_vfpcc <4 x i32> @vhadd_u32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vhadd_u32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.i64 q4, #0xffffffff -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vand q2, q2, q4 -; CHECK-NEXT: vand q3, q3, q4 -; CHECK-NEXT: vmov r0, r1, d4 -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vand q1, q1, q4 -; CHECK-NEXT: vand q4, q0, q4 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, r2, d8 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, r12, d2 -; CHECK-NEXT: adds r4, r3, r1 -; CHECK-NEXT: adc.w r1, r2, r12 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: lsrl r4, r1, #1 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, r4, d9 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, r12, d3 -; CHECK-NEXT: adds r2, r3, r1 -; CHECK-NEXT: adc.w r1, r4, r12 -; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vhadd.u32 q0, q0, q1 +; CHECK-NEXT: bx lr %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> %zextsrc2 = zext <4 x i32> %src2 to <4 x i64> %add = add <4 x i64> %zextsrc1, %zextsrc2 @@ -405,16 +171,10 @@ ; CHECK-NEXT: mov.w lr, #64 ; CHECK-NEXT: .LBB12_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.s16 q0, [r0, #8] -; CHECK-NEXT: vldrb.s16 q1, [r1, #8] -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vldrb.s16 q1, [r1], #16 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vstrb.16 q0, [r2, #8] -; CHECK-NEXT: vldrb.s16 q0, [r0], #16 -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vshr.u16 q0, q0, #1 -; CHECK-NEXT: vstrb.16 q0, [r2], #16 +; CHECK-NEXT: vldrb.u8 q0, [r0], #16 +; CHECK-NEXT: vldrb.u8 q1, [r1], #16 +; CHECK-NEXT: vhadd.s8 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB12_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -453,16 +213,10 @@ ; CHECK-NEXT: mov.w lr, #128 ; CHECK-NEXT: .LBB13_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.s32 q0, [r0, #8] -; CHECK-NEXT: vldrh.s32 q1, [r1, #8] -; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vldrh.s32 q1, [r1], #16 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vstrh.32 q0, [r2, #8] -; CHECK-NEXT: vldrh.s32 q0, [r0], #16 -; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vstrh.32 q0, [r2], #16 +; CHECK-NEXT: vldrh.u16 q0, [r0], #16 +; CHECK-NEXT: vldrh.u16 q1, [r1], #16 +; CHECK-NEXT: vhadd.s16 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB13_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -496,52 +250,18 @@ define void @vhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vhadd_loop_s32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #256 ; CHECK-NEXT: .LBB14_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: adds.w r12, r3, r5 -; CHECK-NEXT: asr.w r4, r3, #31 -; CHECK-NEXT: adc.w r3, r4, r5, asr #31 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: adds r6, r3, r5 -; CHECK-NEXT: asr.w r4, r3, #31 -; CHECK-NEXT: adc.w r3, r4, r5, asr #31 -; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov q4[2], q4[0], r6, r12 -; CHECK-NEXT: adds r4, r3, r5 -; CHECK-NEXT: asr.w r6, r3, #31 -; CHECK-NEXT: adc.w r3, r6, r5, asr #31 -; CHECK-NEXT: lsrl r4, r3, #1 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adds r6, r3, r5 -; CHECK-NEXT: asr.w r12, r3, #31 -; CHECK-NEXT: adc.w r3, r12, r5, asr #31 -; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov q4[3], q4[1], r6, r4 -; CHECK-NEXT: vstrb.8 q4, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vhadd.s32 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB14_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r7, pc} entry: br label %vector.body @@ -577,14 +297,10 @@ ; CHECK-NEXT: mov.w lr, #64 ; CHECK-NEXT: .LBB15_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u16 q0, [r0, #8] -; CHECK-NEXT: vldrb.u16 q1, [r1, #8] -; CHECK-NEXT: vhadd.u16 q0, q1, q0 -; CHECK-NEXT: vldrb.u16 q1, [r1], #16 -; CHECK-NEXT: vstrb.16 q0, [r2, #8] -; CHECK-NEXT: vldrb.u16 q0, [r0], #16 -; CHECK-NEXT: vhadd.u16 q0, q1, q0 -; CHECK-NEXT: vstrb.16 q0, [r2], #16 +; CHECK-NEXT: vldrb.u8 q0, [r0], #16 +; CHECK-NEXT: vldrb.u8 q1, [r1], #16 +; CHECK-NEXT: vhadd.u8 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB15_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -623,14 +339,10 @@ ; CHECK-NEXT: mov.w lr, #128 ; CHECK-NEXT: .LBB16_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q0, [r0, #8] -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vhadd.u32 q0, q1, q0 -; CHECK-NEXT: vldrh.u32 q1, [r1], #16 -; CHECK-NEXT: vstrh.32 q0, [r2, #8] -; CHECK-NEXT: vldrh.u32 q0, [r0], #16 -; CHECK-NEXT: vhadd.u32 q0, q1, q0 -; CHECK-NEXT: vstrh.32 q0, [r2], #16 +; CHECK-NEXT: vldrh.u16 q0, [r0], #16 +; CHECK-NEXT: vldrh.u16 q1, [r1], #16 +; CHECK-NEXT: vhadd.u16 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB16_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -664,56 +376,18 @@ define void @vhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vhadd_loop_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #256 -; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: .LBB17_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q3, [r0], #16 -; CHECK-NEXT: vldrw.u32 q4, [r1], #16 -; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vmov.f32 s8, s18 -; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmov.f32 s10, s19 -; CHECK-NEXT: vand q1, q1, q0 -; CHECK-NEXT: vand q2, q2, q0 -; CHECK-NEXT: vmov r3, r5, d2 -; CHECK-NEXT: vmov r4, r6, d4 -; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: vmov.f32 s18, s17 -; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: vand q5, q4, q0 -; CHECK-NEXT: adds.w r12, r4, r3 -; CHECK-NEXT: adc.w r3, r6, r5 -; CHECK-NEXT: vmov r5, r6, d10 -; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r3, r7, d6 -; CHECK-NEXT: adds r4, r5, r3 -; CHECK-NEXT: adc.w r3, r6, r7 -; CHECK-NEXT: vmov r6, r5, d5 -; CHECK-NEXT: lsrl r4, r3, #1 -; CHECK-NEXT: vmov r3, r7, d3 -; CHECK-NEXT: vmov q4[2], q4[0], r4, r12 -; CHECK-NEXT: adds r6, r6, r3 -; CHECK-NEXT: adc.w r3, r5, r7 -; CHECK-NEXT: vmov r5, r7, d11 -; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov r3, r12, d7 -; CHECK-NEXT: adds r4, r5, r3 -; CHECK-NEXT: adc.w r3, r7, r12 -; CHECK-NEXT: lsrl r4, r3, #1 -; CHECK-NEXT: vmov q4[3], q4[1], r4, r6 -; CHECK-NEXT: vstrb.8 q4, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vhadd.u32 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB17_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop {r7, pc} entry: br label %vector.body @@ -747,21 +421,12 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #64 -; CHECK-NEXT: vmov.i16 q0, #0x1 ; CHECK-NEXT: .LBB18_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u16 q1, [r1, #8] -; CHECK-NEXT: vldrb.u16 q2, [r0, #8] -; CHECK-NEXT: vadd.i16 q1, q2, q1 -; CHECK-NEXT: vldrb.u16 q2, [r0], #16 -; CHECK-NEXT: vadd.i16 q1, q1, q0 -; CHECK-NEXT: vshr.u16 q1, q1, #1 -; CHECK-NEXT: vstrb.16 q1, [r2, #8] -; CHECK-NEXT: vldrb.u16 q1, [r1], #16 -; CHECK-NEXT: vadd.i16 q1, q2, q1 -; CHECK-NEXT: vadd.i16 q1, q1, q0 -; CHECK-NEXT: vshr.u16 q1, q1, #1 -; CHECK-NEXT: vstrb.16 q1, [r2], #16 +; CHECK-NEXT: vldrb.u8 q0, [r1], #16 +; CHECK-NEXT: vldrb.u8 q1, [r0], #16 +; CHECK-NEXT: vrhadd.u8 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB18_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -799,21 +464,12 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #128 -; CHECK-NEXT: vmov.i32 q0, #0x1 ; CHECK-NEXT: .LBB19_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vldrh.u32 q2, [r0, #8] -; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vldrh.u32 q2, [r0], #16 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vshr.u32 q1, q1, #1 -; CHECK-NEXT: vstrh.32 q1, [r2, #8] -; CHECK-NEXT: vldrh.u32 q1, [r1], #16 -; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vshr.u32 q1, q1, #1 -; CHECK-NEXT: vstrh.32 q1, [r2], #16 +; CHECK-NEXT: vldrh.u16 q0, [r1], #16 +; CHECK-NEXT: vldrh.u16 q1, [r0], #16 +; CHECK-NEXT: vrhadd.u16 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB19_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -848,61 +504,18 @@ define void @vrhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vrhadd_loop_s32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #256 -; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: .LBB20_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q3, [r1], #16 -; CHECK-NEXT: vldrw.u32 q4, [r0], #16 -; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vmov.f32 s8, s18 -; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmov.f32 s10, s19 -; CHECK-NEXT: vand q1, q1, q0 -; CHECK-NEXT: vand q2, q2, q0 -; CHECK-NEXT: vmov r3, r12, d2 -; CHECK-NEXT: vmov r4, r5, d4 -; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: vmov.f32 s18, s17 -; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: vand q5, q4, q0 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adc.w r4, r5, r12 -; CHECK-NEXT: adds.w r12, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: vmov r5, r6, d10 -; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r3, r4, d6 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: adds r6, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: vmov r5, r4, d5 -; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov q4[2], q4[0], r6, r12 -; CHECK-NEXT: vmov r3, r6, d3 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: adds.w r12, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: vmov r5, r6, d11 -; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r3, r4, d7 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: adds r6, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov q4[3], q4[1], r6, r12 -; CHECK-NEXT: vstrb.8 q4, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vrhadd.u32 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB20_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r7, pc} entry: br label %vector.body @@ -937,21 +550,12 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #64 -; CHECK-NEXT: vmov.i16 q0, #0x1 ; CHECK-NEXT: .LBB21_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u16 q1, [r1, #8] -; CHECK-NEXT: vldrb.u16 q2, [r0, #8] -; CHECK-NEXT: vadd.i16 q1, q2, q1 -; CHECK-NEXT: vldrb.u16 q2, [r0], #16 -; CHECK-NEXT: vadd.i16 q1, q1, q0 -; CHECK-NEXT: vshr.u16 q1, q1, #1 -; CHECK-NEXT: vstrb.16 q1, [r2, #8] -; CHECK-NEXT: vldrb.u16 q1, [r1], #16 -; CHECK-NEXT: vadd.i16 q1, q2, q1 -; CHECK-NEXT: vadd.i16 q1, q1, q0 -; CHECK-NEXT: vshr.u16 q1, q1, #1 -; CHECK-NEXT: vstrb.16 q1, [r2], #16 +; CHECK-NEXT: vldrb.u8 q0, [r1], #16 +; CHECK-NEXT: vldrb.u8 q1, [r0], #16 +; CHECK-NEXT: vrhadd.u8 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB21_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -989,21 +593,12 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #128 -; CHECK-NEXT: vmov.i32 q0, #0x1 ; CHECK-NEXT: .LBB22_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vldrh.u32 q2, [r0, #8] -; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vldrh.u32 q2, [r0], #16 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vshr.u32 q1, q1, #1 -; CHECK-NEXT: vstrh.32 q1, [r2, #8] -; CHECK-NEXT: vldrh.u32 q1, [r1], #16 -; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vshr.u32 q1, q1, #1 -; CHECK-NEXT: vstrh.32 q1, [r2], #16 +; CHECK-NEXT: vldrh.u16 q0, [r1], #16 +; CHECK-NEXT: vldrh.u16 q1, [r0], #16 +; CHECK-NEXT: vrhadd.u16 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB22_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -1038,61 +633,18 @@ define void @vrhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vrhadd_loop_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #256 -; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: .LBB23_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q3, [r1], #16 -; CHECK-NEXT: vldrw.u32 q4, [r0], #16 -; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vmov.f32 s8, s18 -; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmov.f32 s10, s19 -; CHECK-NEXT: vand q1, q1, q0 -; CHECK-NEXT: vand q2, q2, q0 -; CHECK-NEXT: vmov r3, r12, d2 -; CHECK-NEXT: vmov r4, r5, d4 -; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: vmov.f32 s18, s17 -; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: vand q5, q4, q0 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adc.w r4, r5, r12 -; CHECK-NEXT: adds.w r12, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: vmov r5, r6, d10 -; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r3, r4, d6 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: adds r6, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: vmov r5, r4, d5 -; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov q4[2], q4[0], r6, r12 -; CHECK-NEXT: vmov r3, r6, d3 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: adds.w r12, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: vmov r5, r6, d11 -; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r3, r4, d7 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: adds r6, r3, #1 -; CHECK-NEXT: adc r3, r4, #0 -; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov q4[3], q4[1], r6, r12 -; CHECK-NEXT: vstrb.8 q4, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vrhadd.u32 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB23_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r7, pc} entry: br label %vector.body