diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -281,6 +281,8 @@ setOperationAction(ISD::UADDSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); + setOperationAction(ISD::ABDS, VT, Legal); + setOperationAction(ISD::ABDU, VT, Legal); // No native support for these. setOperationAction(ISD::UDIV, VT, Expand); @@ -14616,6 +14618,8 @@ case ARMISD::VQDMULH: case ISD::MULHS: case ISD::MULHU: + case ISD::ABDS: + case ISD::ABDU: break; default: return SDValue(); diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -2131,36 +2131,31 @@ let validForTailPredication = 1; } -multiclass MVE_VABD_m { +multiclass MVE_VABD_m { def "" : MVE_VABD_int; defvar Inst = !cast(NAME); let Predicates = [HasMVEInt] in { + defm : MVE_TwoOpPattern(NAME)>; + // Unpredicated absolute difference def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))), (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; - - // Predicated absolute difference - def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), - (VTI.Vec MQPR:$inactive))), - (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - ARMVCCThen, (VTI.Pred VCCR:$mask), - (VTI.Vec MQPR:$inactive)))>; } } -multiclass MVE_VABD - : MVE_VABD_m; +multiclass MVE_VABD + : MVE_VABD_m; -defm MVE_VABDs8 : MVE_VABD; -defm MVE_VABDs16 : MVE_VABD; -defm MVE_VABDs32 : MVE_VABD; -defm MVE_VABDu8 : MVE_VABD; -defm MVE_VABDu16 : MVE_VABD; -defm MVE_VABDu32 : MVE_VABD; +defm MVE_VABDs8 : MVE_VABD; +defm MVE_VABDs16 : MVE_VABD; +defm MVE_VABDs32 : MVE_VABD; +defm MVE_VABDu8 : MVE_VABD; +defm MVE_VABDu16 : MVE_VABD; +defm MVE_VABDu32 : MVE_VABD; class MVE_VRHADD_Base size, list pattern=[]> : MVE_int<"vrhadd", suffix, size, pattern> { diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll --- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll @@ -4,15 +4,7 @@ define arm_aapcs_vfpcc <16 x i8> @vabd_s8(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-LABEL: vabd_s8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmovlt.s8 q2, q1 -; CHECK-NEXT: vmovlt.s8 q3, q0 -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vsub.i16 q2, q3, q2 -; CHECK-NEXT: vsub.i16 q0, q0, q1 -; CHECK-NEXT: vabs.s16 q2, q2 -; CHECK-NEXT: vabs.s16 q0, q0 -; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vabd.s8 q0, q0, q1 ; CHECK-NEXT: bx lr %sextsrc1 = sext <16 x i8> %src1 to <16 x i16> %sextsrc2 = sext <16 x i8> %src2 to <16 x i16> @@ -27,15 +19,7 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: vabd_s16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmovlt.s16 q2, q1 -; CHECK-NEXT: vmovlt.s16 q3, q0 -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vsub.i32 q2, q3, q2 -; CHECK-NEXT: vsub.i32 q0, q0, q1 -; CHECK-NEXT: vabs.s32 q2, q2 -; CHECK-NEXT: vabs.s32 q0, q0 -; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: vabd.s16 q0, q0, q1 ; CHECK-NEXT: bx lr %sextsrc1 = sext <8 x i16> %src1 to <8 x i32> %sextsrc2 = sext <8 x i16> %src2 to <8 x i32> @@ -50,46 +34,7 @@ define arm_aapcs_vfpcc <4 x i32> @vabd_s32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vabd_s32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s16, s6 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: subs r0, r0, r2 -; CHECK-NEXT: sbc.w r1, r1, r2, asr #31 -; CHECK-NEXT: add.w r0, r0, r1, asr #31 -; CHECK-NEXT: eor.w r0, r0, r1, asr #31 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: subs r1, r1, r3 -; CHECK-NEXT: sbc.w r2, r2, r3, asr #31 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: add.w r1, r1, r2, asr #31 -; CHECK-NEXT: eor.w r1, r1, r2, asr #31 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: subs r0, r0, r2 -; CHECK-NEXT: sbc.w r1, r1, r2, asr #31 -; CHECK-NEXT: add.w r0, r0, r1, asr #31 -; CHECK-NEXT: eor.w r0, r0, r1, asr #31 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: subs r1, r1, r3 -; CHECK-NEXT: sbc.w r2, r2, r3, asr #31 -; CHECK-NEXT: add.w r1, r1, r2, asr #31 -; CHECK-NEXT: eor.w r1, r1, r2, asr #31 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vabd.s32 q0, q0, q1 ; CHECK-NEXT: bx lr %sextsrc1 = sext <4 x i32> %src1 to <4 x i64> %sextsrc2 = sext <4 x i32> %src2 to <4 x i64> @@ -104,15 +49,7 @@ define arm_aapcs_vfpcc <16 x i8> @vabd_u8(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-LABEL: vabd_u8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmovlt.u8 q2, q1 -; CHECK-NEXT: vmovlt.u8 q3, q0 -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vsub.i16 q2, q3, q2 -; CHECK-NEXT: vsub.i16 q0, q0, q1 -; CHECK-NEXT: vabs.s16 q2, q2 -; CHECK-NEXT: vabs.s16 q0, q0 -; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vabd.u8 q0, q0, q1 ; CHECK-NEXT: bx lr %zextsrc1 = zext <16 x i8> %src1 to <16 x i16> %zextsrc2 = zext <16 x i8> %src2 to <16 x i16> @@ -127,15 +64,7 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_u16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: vabd_u16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmovlt.u16 q2, q1 -; CHECK-NEXT: vmovlt.u16 q3, q0 -; CHECK-NEXT: vmovlb.u16 q1, q1 -; CHECK-NEXT: vmovlb.u16 q0, q0 -; CHECK-NEXT: vsub.i32 q2, q3, q2 -; CHECK-NEXT: vsub.i32 q0, q0, q1 -; CHECK-NEXT: vabs.s32 q2, q2 -; CHECK-NEXT: vabs.s32 q0, q0 -; CHECK-NEXT: vmovnt.i32 q0, q2 +; CHECK-NEXT: vabd.u16 q0, q0, q1 ; CHECK-NEXT: bx lr %zextsrc1 = zext <8 x i16> %src1 to <8 x i32> %zextsrc2 = zext <8 x i16> %src2 to <8 x i32> @@ -150,46 +79,7 @@ define arm_aapcs_vfpcc <4 x i32> @vabd_u32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vabd_u32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.i64 q4, #0xffffffff -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vand q2, q2, q4 -; CHECK-NEXT: vand q3, q3, q4 -; CHECK-NEXT: vmov r0, r1, d4 -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vand q1, q1, q4 -; CHECK-NEXT: vand q4, q0, q4 -; CHECK-NEXT: subs r0, r2, r0 -; CHECK-NEXT: sbc.w r1, r3, r1 -; CHECK-NEXT: add.w r0, r0, r1, asr #31 -; CHECK-NEXT: eor.w r12, r0, r1, asr #31 -; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: vmov r3, r0, d8 -; CHECK-NEXT: subs r1, r3, r1 -; CHECK-NEXT: sbcs r0, r2 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: add.w r1, r1, r0, asr #31 -; CHECK-NEXT: eor.w r0, r1, r0, asr #31 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r12 -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: subs r0, r2, r0 -; CHECK-NEXT: sbc.w r1, r3, r1 -; CHECK-NEXT: add.w r0, r0, r1, asr #31 -; CHECK-NEXT: eor.w r12, r0, r1, asr #31 -; CHECK-NEXT: vmov r1, r2, d3 -; CHECK-NEXT: vmov r3, r0, d9 -; CHECK-NEXT: subs r1, r3, r1 -; CHECK-NEXT: sbcs r0, r2 -; CHECK-NEXT: add.w r1, r1, r0, asr #31 -; CHECK-NEXT: eor.w r0, r1, r0, asr #31 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vabd.u32 q0, q0, q1 ; CHECK-NEXT: bx lr %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> %zextsrc2 = zext <4 x i32> %src2 to <4 x i64> @@ -209,26 +99,37 @@ ; CHECK-NEXT: mov.w lr, #64 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.s32 q0, [r1, #12] -; CHECK-NEXT: vldrb.s32 q1, [r0, #12] -; CHECK-NEXT: vsub.i32 q0, q1, q0 -; CHECK-NEXT: vldrb.s32 q1, [r0, #8] -; CHECK-NEXT: vabs.s32 q0, q0 -; CHECK-NEXT: vstrb.32 q0, [r2, #12] -; CHECK-NEXT: vldrb.s32 q0, [r1, #8] -; CHECK-NEXT: vsub.i32 q0, q1, q0 -; CHECK-NEXT: vldrb.s32 q1, [r0, #4] -; CHECK-NEXT: vabs.s32 q0, q0 -; CHECK-NEXT: vstrb.32 q0, [r2, #8] -; CHECK-NEXT: vldrb.s32 q0, [r1, #4] -; CHECK-NEXT: vsub.i32 q0, q1, q0 -; CHECK-NEXT: vldrb.s32 q1, [r0], #16 -; CHECK-NEXT: vabs.s32 q0, q0 -; CHECK-NEXT: vstrb.32 q0, [r2, #4] -; CHECK-NEXT: vldrb.s32 q0, [r1], #16 -; CHECK-NEXT: vsub.i32 q0, q1, q0 -; CHECK-NEXT: vabs.s32 q0, q0 -; CHECK-NEXT: vstrb.32 q0, [r2], #16 +; CHECK-NEXT: vldrb.u8 q0, [r1], #16 +; CHECK-NEXT: vldrb.u8 q1, [r0], #16 +; CHECK-NEXT: vabd.s8 q0, q1, q0 +; CHECK-NEXT: vmov.u8 r12, q0[14] +; CHECK-NEXT: vmov.u8 r3, q0[12] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 +; CHECK-NEXT: vmov.u8 r12, q0[15] +; CHECK-NEXT: vmov.u8 r3, q0[13] +; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 +; CHECK-NEXT: vmov.u8 r12, q0[10] +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: vstrb.32 q1, [r2, #12] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 +; CHECK-NEXT: vmov.u8 r12, q0[11] +; CHECK-NEXT: vmov.u8 r3, q0[9] +; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 +; CHECK-NEXT: vmov.u8 r12, q0[6] +; CHECK-NEXT: vmov.u8 r3, q0[4] +; CHECK-NEXT: vstrb.32 q1, [r2, #8] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 +; CHECK-NEXT: vmov.u8 r12, q0[7] +; CHECK-NEXT: vmov.u8 r3, q0[5] +; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 +; CHECK-NEXT: vmov.u8 r12, q0[2] +; CHECK-NEXT: vmov.u8 r3, q0[0] +; CHECK-NEXT: vstrb.32 q1, [r2, #4] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 +; CHECK-NEXT: vmov.u8 r12, q0[3] +; CHECK-NEXT: vmov.u8 r3, q0[1] +; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 +; CHECK-NEXT: vstrb.32 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -269,16 +170,23 @@ ; CHECK-NEXT: mov.w lr, #128 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.s32 q0, [r1, #8] -; CHECK-NEXT: vldrh.s32 q1, [r0, #8] -; CHECK-NEXT: vsub.i32 q0, q1, q0 -; CHECK-NEXT: vldrh.s32 q1, [r0], #16 -; CHECK-NEXT: vabs.s32 q0, q0 -; CHECK-NEXT: vstrh.32 q0, [r2, #8] -; CHECK-NEXT: vldrh.s32 q0, [r1], #16 -; CHECK-NEXT: vsub.i32 q0, q1, q0 -; CHECK-NEXT: vabs.s32 q0, q0 -; CHECK-NEXT: vstrh.32 q0, [r2], #16 +; CHECK-NEXT: vldrh.u16 q0, [r1], #16 +; CHECK-NEXT: vldrh.u16 q1, [r0], #16 +; CHECK-NEXT: vabd.s16 q0, q1, q0 +; CHECK-NEXT: vmov.u16 r12, q0[6] +; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 +; CHECK-NEXT: vmov.u16 r12, q0[7] +; CHECK-NEXT: vmov.u16 r3, q0[5] +; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 +; CHECK-NEXT: vmov.u16 r12, q0[2] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: vstrh.32 q1, [r2, #8] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 +; CHECK-NEXT: vmov.u16 r12, q0[3] +; CHECK-NEXT: vmov.u16 r3, q0[1] +; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 +; CHECK-NEXT: vstrh.32 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -419,26 +327,37 @@ ; CHECK-NEXT: mov.w lr, #64 ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u32 q0, [r1, #12] -; CHECK-NEXT: vldrb.u32 q1, [r0, #12] -; CHECK-NEXT: vsub.i32 q0, q1, q0 -; CHECK-NEXT: vldrb.u32 q1, [r0, #8] -; CHECK-NEXT: vabs.s32 q0, q0 -; CHECK-NEXT: vstrb.32 q0, [r2, #12] -; CHECK-NEXT: vldrb.u32 q0, [r1, #8] -; CHECK-NEXT: vsub.i32 q0, q1, q0 -; CHECK-NEXT: vldrb.u32 q1, [r0, #4] -; CHECK-NEXT: vabs.s32 q0, q0 -; CHECK-NEXT: vstrb.32 q0, [r2, #8] -; CHECK-NEXT: vldrb.u32 q0, [r1, #4] -; CHECK-NEXT: vsub.i32 q0, q1, q0 -; CHECK-NEXT: vldrb.u32 q1, [r0], #16 -; CHECK-NEXT: vabs.s32 q0, q0 -; CHECK-NEXT: vstrb.32 q0, [r2, #4] -; CHECK-NEXT: vldrb.u32 q0, [r1], #16 -; CHECK-NEXT: vsub.i32 q0, q1, q0 -; CHECK-NEXT: vabs.s32 q0, q0 -; CHECK-NEXT: vstrb.32 q0, [r2], #16 +; CHECK-NEXT: vldrb.u8 q0, [r1], #16 +; CHECK-NEXT: vldrb.u8 q1, [r0], #16 +; CHECK-NEXT: vabd.u8 q0, q1, q0 +; CHECK-NEXT: vmov.u8 r12, q0[14] +; CHECK-NEXT: vmov.u8 r3, q0[12] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 +; CHECK-NEXT: vmov.u8 r12, q0[15] +; CHECK-NEXT: vmov.u8 r3, q0[13] +; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 +; CHECK-NEXT: vmov.u8 r12, q0[10] +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: vstrb.32 q1, [r2, #12] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 +; CHECK-NEXT: vmov.u8 r12, q0[11] +; CHECK-NEXT: vmov.u8 r3, q0[9] +; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 +; CHECK-NEXT: vmov.u8 r12, q0[6] +; CHECK-NEXT: vmov.u8 r3, q0[4] +; CHECK-NEXT: vstrb.32 q1, [r2, #8] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 +; CHECK-NEXT: vmov.u8 r12, q0[7] +; CHECK-NEXT: vmov.u8 r3, q0[5] +; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 +; CHECK-NEXT: vmov.u8 r12, q0[2] +; CHECK-NEXT: vmov.u8 r3, q0[0] +; CHECK-NEXT: vstrb.32 q1, [r2, #4] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 +; CHECK-NEXT: vmov.u8 r12, q0[3] +; CHECK-NEXT: vmov.u8 r3, q0[1] +; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 +; CHECK-NEXT: vstrb.32 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB9_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -479,16 +398,23 @@ ; CHECK-NEXT: mov.w lr, #128 ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q0, [r1, #8] -; CHECK-NEXT: vldrh.u32 q1, [r0, #8] -; CHECK-NEXT: vsub.i32 q0, q1, q0 -; CHECK-NEXT: vldrh.u32 q1, [r0], #16 -; CHECK-NEXT: vabs.s32 q0, q0 -; CHECK-NEXT: vstrh.32 q0, [r2, #8] -; CHECK-NEXT: vldrh.u32 q0, [r1], #16 -; CHECK-NEXT: vsub.i32 q0, q1, q0 -; CHECK-NEXT: vabs.s32 q0, q0 -; CHECK-NEXT: vstrh.32 q0, [r2], #16 +; CHECK-NEXT: vldrh.u16 q0, [r1], #16 +; CHECK-NEXT: vldrh.u16 q1, [r0], #16 +; CHECK-NEXT: vabd.u16 q0, q1, q0 +; CHECK-NEXT: vmov.u16 r12, q0[6] +; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 +; CHECK-NEXT: vmov.u16 r12, q0[7] +; CHECK-NEXT: vmov.u16 r3, q0[5] +; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 +; CHECK-NEXT: vmov.u16 r12, q0[2] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: vstrh.32 q1, [r2, #8] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 +; CHECK-NEXT: vmov.u16 r12, q0[3] +; CHECK-NEXT: vmov.u16 r3, q0[1] +; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 +; CHECK-NEXT: vstrh.32 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB10_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc}