diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -7513,6 +7513,35 @@
   return true;
 }
 
+/// Returns true if the shuffle mask \p M picks every second element, as a
+/// half-width truncation into \p VT would (only v8i16 and v16i8 are accepted).
+/// \p Top selects the odd-numbered source lanes instead of the even ones.
+/// With \p SingleSource the upper half of the mask repeats the indices of the
+/// lower half; otherwise it is offset by the element count, i.e. it indexes
+/// the second shuffle operand. Negative mask entries match anything.
+static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
+  unsigned NumElts = VT.getVectorNumElements();
+  // Make sure the mask has the right size.
+  if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
+    return false;
+
+  // Half-width truncation patterns (e.g. v4i32 -> v8i16):
+  // !Top &&  SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
+  // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
+  //  Top &&  SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
+  //  Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
+  int Ofs = Top ? 1 : 0;
+  int Upper = SingleSource ? 0 : NumElts;
+  // Mask entries are signed, so index with a signed counter as well.
+  for (int i = 0, e = NumElts / 2; i != e; ++i) {
+    if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
+      return false;
+    if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
+      return false;
+  }
+  return true;
+}
+
 static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
   unsigned NumElts = VT.getVectorNumElements();
   // Make sure the mask has the right size.
@@ -8365,6 +8387,11 @@ (isVMOVNMask(M, VT, true, false) || isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true))) return true; + else if (Subtarget->hasMVEIntegerOps() && + (isTruncMask(M, VT, false, false) || + isTruncMask(M, VT, false, true) || + isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true))) + return true; else return false; } @@ -8837,10 +8864,29 @@ } } - if (ST->hasMVEIntegerOps() && EltSize <= 32) + if (ST->hasMVEIntegerOps() && EltSize <= 32) { if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG)) return V; + for (bool Top : {false, true}) { + for (bool SingleSource : {false, true}) { + if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) { + MVT FromSVT = MVT::getIntegerVT(EltSize * 2); + MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2); + SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1); + SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, + SingleSource ? V1 : V2); + if (Top) { + SDValue Amt = DAG.getConstant(EltSize, dl, FromVT); + Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt); + Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt); + } + return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi); + } + } + } + } + // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. 
unsigned NumElts = VT.getVectorNumElements(); diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -1274,56 +1274,53 @@ define arm_aapcs_vfpcc void @half_short_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) { ; CHECK-LABEL: half_short_mul: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq .LBB8_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader +; CHECK-NEXT: mov r8, r2 +; CHECK-NEXT: mov r9, r1 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB8_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB8_6 ; CHECK-NEXT: .LBB8_3: @ %vector.ph -; CHECK-NEXT: bic r7, r3, #3 -; CHECK-NEXT: str r7, [sp] @ 4-byte Spill -; CHECK-NEXT: subs r6, r7, #4 -; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: add.w lr, r5, r6, lsr #2 -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: bic r12, r3, #3 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: sub.w r7, r12, #4 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: add.w lr, r6, r7, lsr #2 +; CHECK-NEXT: mov r6, r9 +; CHECK-NEXT: mov r7, r8 ; CHECK-NEXT: .LBB8_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q0, [r5], #8 -; CHECK-NEXT: ldr.w r9, [r4] -; CHECK-NEXT: ldr.w r10, [r4, #4] -; CHECK-NEXT: adds r4, #8 -; CHECK-NEXT: vmov r7, r12, d0 -; CHECK-NEXT: vmov.32 q1[0], r9 -; CHECK-NEXT: vmov r11, r8, d1 -; CHECK-NEXT: vmov.16 q0[0], r7 -; CHECK-NEXT: vmov.16 q0[1], r12 -; CHECK-NEXT: vmov.32 q1[1], r10 -; CHECK-NEXT: vmov.16 q0[2], r11 -; CHECK-NEXT: vmov.16 q0[3], r8 +; 
CHECK-NEXT: vldrh.u32 q0, [r6], #8 +; CHECK-NEXT: ldr r4, [r5] +; CHECK-NEXT: ldr r2, [r5, #4] +; CHECK-NEXT: adds r5, #8 +; CHECK-NEXT: vstrh.32 q0, [r1] +; CHECK-NEXT: vmov.32 q1[0], r4 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vmov.32 q1[1], r2 ; CHECK-NEXT: vcvt.f16.s16 q0, q0 ; CHECK-NEXT: vmul.f16 q0, q1, q0 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-NEXT: vstrb.8 q0, [r6], #16 +; CHECK-NEXT: vstrb.8 q0, [r7], #16 ; CHECK-NEXT: le lr, .LBB8_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload -; CHECK-NEXT: cmp r7, r3 +; CHECK-NEXT: cmp r12, r3 ; CHECK-NEXT: beq .LBB8_8 ; CHECK-NEXT: .LBB8_6: @ %for.body.preheader13 -; CHECK-NEXT: sub.w lr, r3, r7 -; CHECK-NEXT: add.w r0, r0, r7, lsl #1 -; CHECK-NEXT: add.w r1, r1, r7, lsl #1 -; CHECK-NEXT: add.w r2, r2, r7, lsl #2 +; CHECK-NEXT: sub.w lr, r3, r12 +; CHECK-NEXT: add.w r0, r0, r12, lsl #1 +; CHECK-NEXT: add.w r1, r9, r12, lsl #1 +; CHECK-NEXT: add.w r2, r8, r12, lsl #2 ; CHECK-NEXT: .LBB8_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh r3, [r1], #2 @@ -1336,8 +1333,8 @@ ; CHECK-NEXT: vstmia r2!, {s0} ; CHECK-NEXT: le lr, .LBB8_7 ; CHECK-NEXT: .LBB8_8: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: %cmp10 = icmp eq i32 %N, 0 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -319,27 +319,20 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle2step_i16(<16 x i16> %src) { ; CHECK-LABEL: shuffle2step_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmovx.f16 s9, s2 -; 
CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vins.f16 s2, s3 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vins.f16 s8, s0 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmovx.f16 s10, s4 -; CHECK-NEXT: vins.f16 s9, s0 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vins.f16 s4, s5 -; CHECK-NEXT: vins.f16 s10, s0 -; CHECK-NEXT: vmovx.f16 s11, s6 -; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vins.f16 s6, s7 -; CHECK-NEXT: vmov.f32 s13, s2 -; CHECK-NEXT: vins.f16 s11, s0 -; CHECK-NEXT: vmov.f32 s14, s4 -; CHECK-NEXT: vmov.f32 s15, s6 -; CHECK-NEXT: vadd.i16 q0, q3, q2 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vshr.u32 q2, q1, #16 +; CHECK-NEXT: vstrh.32 q2, [r0, #8] +; CHECK-NEXT: vshr.u32 q2, q0, #16 +; CHECK-NEXT: add r1, sp, #16 +; CHECK-NEXT: vstrh.32 q2, [r0] +; CHECK-NEXT: vstrh.32 q1, [r1, #8] +; CHECK-NEXT: vstrh.32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr entry: %s1 = shufflevector <16 x i16> %src, <16 x i16> undef, <8 x i32> @@ -615,71 +608,20 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle2step_i8(<32 x i8> %src) { ; CHECK-LABEL: shuffle2step_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.8 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.8 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.8 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.8 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmov.8 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.8 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.8 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.8 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.8 q2[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q2[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.8 
q2[10], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q2[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q2[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q2[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q2[14], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.8 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.8 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.8 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov.8 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] -; CHECK-NEXT: vmov.8 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.8 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: vmov.8 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: vmov.8 q3[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q3[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q3[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.8 q3[10], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q3[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q3[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q3[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q3[14], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q3[15], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q2[15], r0 -; CHECK-NEXT: vadd.i8 q0, q2, q3 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vshr.u16 q2, q1, #8 +; CHECK-NEXT: vstrb.16 q2, [r0, #8] +; CHECK-NEXT: vshr.u16 q2, q0, #8 +; CHECK-NEXT: add r1, sp, #16 +; CHECK-NEXT: vstrb.16 q2, [r0] +; CHECK-NEXT: vstrb.16 q1, [r1, #8] +; CHECK-NEXT: vstrb.16 q0, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vadd.i8 q0, q1, q0 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr entry: %s1 = shufflevector <32 x i8> %src, <32 x i8> undef, <16 x i32> diff --git 
a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -204,29 +204,23 @@ define void @vld2_v8i16_align1(ptr %src, ptr %dst) { ; CHECK-LABEL: vld2_v8i16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u8 q0, [r0] -; CHECK-NEXT: vldrb.u8 q2, [r0, #16] -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s5, s2 -; CHECK-NEXT: vmovx.f16 s6, s3 -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vins.f16 s5, s6 -; CHECK-NEXT: vmovx.f16 s6, s8 -; CHECK-NEXT: vins.f16 s6, s12 -; CHECK-NEXT: vmovx.f16 s7, s10 -; CHECK-NEXT: vmovx.f16 s12, s11 -; CHECK-NEXT: vins.f16 s2, s3 -; CHECK-NEXT: vins.f16 s10, s11 -; CHECK-NEXT: vins.f16 s8, s9 -; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vins.f16 s7, s12 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s10 -; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: vldrb.u8 q0, [r0, #16] +; CHECK-NEXT: add r2, sp, #16 +; CHECK-NEXT: vshr.u32 q1, q0, #16 +; CHECK-NEXT: vstrh.32 q1, [r2, #8] +; CHECK-NEXT: vldrb.u8 q1, [r0] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vshr.u32 q2, q1, #16 +; CHECK-NEXT: vstrh.32 q2, [r2] +; CHECK-NEXT: vstrh.32 q0, [r0, #8] +; CHECK-NEXT: vstrh.32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vadd.i16 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr entry: %l1 = load <16 x i16>, ptr %src, align 1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -246,40 +246,40 @@ define void @vld4_v4i16(ptr %src, ptr %dst) { ; CHECK-LABEL: vld4_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; 
CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16] +; CHECK-NEXT: mov r4, sp +; CHECK-NEXT: vshr.u32 q1, q0, #16 +; CHECK-NEXT: vmov.u16 r3, q0[3] +; CHECK-NEXT: vstrh.32 q1, [r4, #8] ; CHECK-NEXT: vldrh.u16 q1, [r0] -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.u16 r2, q1[6] -; CHECK-NEXT: vmov q3[3], q3[1], r2, r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov q4[2], q4[0], r2, r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov q4[3], q4[1], r2, r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov q3[3], q3[1], r2, r0 -; CHECK-NEXT: vadd.i32 q0, q3, q4 -; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vshr.u32 q2, q1, #16 +; CHECK-NEXT: vmov.u16 r2, q1[3] +; CHECK-NEXT: vstrh.32 q2, [r4] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r3, q1[7] +; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.u16 r3, q1[2] +; CHECK-NEXT: vstrh.32 q0, [r0, #8] +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q0[6] +; CHECK-NEXT: vmov.u16 r3, q1[6] +; CHECK-NEXT: vstrh.32 q1, [r0] +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: vadd.i32 q0, q3, q2 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vadd.i32 q1, q2, q1 
+; CHECK-NEXT: vadd.i32 q0, q1, q0 ; CHECK-NEXT: vstrh.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: pop {r4, pc} entry: %l1 = load <16 x i16>, ptr %src, align 2 %s1 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> @@ -490,79 +490,57 @@ define void @vld4_v8i8(ptr %src, ptr %dst) { ; CHECK-LABEL: vld4_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrb.u8 q1, [r0] +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vldrb.u8 q0, [r0, #16] -; CHECK-NEXT: vmov.u8 r2, q1[3] -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.u8 r2, q1[7] -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov.u8 r2, q1[11] -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.u8 r2, q1[15] -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vadd.i16 q2, q3, q2 -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: 
vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vadd.i16 q0, q3, q4 -; CHECK-NEXT: vadd.i16 q0, q0, q2 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: vshr.u16 q1, q0, #8 +; CHECK-NEXT: vstrb.16 q1, [r2, #8] +; CHECK-NEXT: vldrb.u8 q2, [r0] +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vshr.u16 q1, q2, #8 +; CHECK-NEXT: vmov.u8 r3, q2[3] +; CHECK-NEXT: vstrb.16 q1, [r2] +; CHECK-NEXT: vmov.16 q1[0], r3 +; CHECK-NEXT: vmov.u8 r3, q2[7] +; CHECK-NEXT: vstrb.16 q0, [r0, #8] +; CHECK-NEXT: vmov.16 q1[1], r3 +; CHECK-NEXT: vmov.u8 r3, q2[11] +; CHECK-NEXT: vmov.16 q1[2], r3 +; CHECK-NEXT: vmov.u8 r3, q2[15] +; CHECK-NEXT: vmov.16 q1[3], r3 +; CHECK-NEXT: vmov.u8 r3, q0[3] +; CHECK-NEXT: vmov.16 q1[4], r3 +; CHECK-NEXT: vmov.u8 r3, q0[7] +; CHECK-NEXT: vmov.16 q1[5], r3 +; CHECK-NEXT: vmov.u8 r3, q0[11] +; CHECK-NEXT: vmov.16 q1[6], r3 +; CHECK-NEXT: vmov.u8 r3, q0[15] +; CHECK-NEXT: vmov.16 q1[7], r3 +; CHECK-NEXT: vmov.u8 r3, q2[2] +; CHECK-NEXT: vmov.16 q3[0], r3 +; CHECK-NEXT: vmov.u8 r3, q2[6] +; CHECK-NEXT: vmov.16 q3[1], r3 +; CHECK-NEXT: vmov.u8 r3, q2[10] +; CHECK-NEXT: vmov.16 q3[2], r3 +; CHECK-NEXT: vmov.u8 r3, q2[14] +; CHECK-NEXT: vmov.16 q3[3], r3 +; CHECK-NEXT: vmov.u8 r3, q0[2] +; CHECK-NEXT: vmov.16 q3[4], r3 +; CHECK-NEXT: vmov.u8 r3, q0[6] +; CHECK-NEXT: 
vmov.16 q3[5], r3 +; CHECK-NEXT: vmov.u8 r3, q0[10] +; CHECK-NEXT: vmov.16 q3[6], r3 +; CHECK-NEXT: vmov.u8 r3, q0[14] +; CHECK-NEXT: vstrb.16 q2, [r0] +; CHECK-NEXT: vmov.16 q3[7], r3 +; CHECK-NEXT: vadd.i16 q0, q3, q1 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vadd.i16 q1, q2, q1 +; CHECK-NEXT: vadd.i16 q0, q1, q0 ; CHECK-NEXT: vstrb.16 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr entry: %l1 = load <32 x i8>, ptr %src, align 1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-vqdmulh-minmax.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh-minmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh-minmax.ll @@ -147,51 +147,45 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_c(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-LABEL: vqdmulh_v8i16_c: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.u16 r1, q2[1] -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmullb.s16 q0, q3, q0 -; CHECK-NEXT: vshl.i32 q0, q0, #10 -; CHECK-NEXT: vshr.s32 q0, q0, #10 -; CHECK-NEXT: vshr.s32 q3, q0, #15 -; CHECK-NEXT: vmov r0, r1, d6 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: vmov r0, r1, d7 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: vmov.u16 r1, q2[4] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.u16 r1, q2[5] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; 
CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 ; CHECK-NEXT: vmov.u16 r0, q1[6] ; CHECK-NEXT: vmov.u16 r1, q1[4] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 ; CHECK-NEXT: vmov.u16 r0, q1[7] ; CHECK-NEXT: vmov.u16 r1, q1[5] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmullb.s16 q1, q2, q3 -; CHECK-NEXT: vshl.i32 q1, q1, #10 -; CHECK-NEXT: vshr.s32 q1, q1, #10 -; CHECK-NEXT: vshr.s32 q1, q1, #15 -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.16 q0[7], r1 +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vmullb.s16 q2, q3, q2 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vshl.i32 q2, q2, #10 +; CHECK-NEXT: vshr.s32 q2, q2, #10 +; CHECK-NEXT: vshr.s32 q2, q2, #15 +; CHECK-NEXT: vstrh.32 q2, [r0, #8] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q1[2] +; CHECK-NEXT: vmov.u16 r2, q1[0] +; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q1[3] +; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 +; CHECK-NEXT: vmullb.s16 q0, q0, q2 +; CHECK-NEXT: vshl.i32 q0, q0, #10 +; CHECK-NEXT: vshr.s32 q0, q0, #10 +; CHECK-NEXT: vshr.s32 q0, q0, #15 +; CHECK-NEXT: vstrh.32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %l2 = sext <8 x i16> %s0 to <8 x i22> diff --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll @@ -156,51 +156,45 @@ define arm_aapcs_vfpcc <8 
x i16> @vqdmulh_v8i16_c(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-LABEL: vqdmulh_v8i16_c: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.u16 r1, q2[1] -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmullb.s16 q0, q3, q0 -; CHECK-NEXT: vshl.i32 q0, q0, #10 -; CHECK-NEXT: vshr.s32 q0, q0, #10 -; CHECK-NEXT: vshr.s32 q3, q0, #15 -; CHECK-NEXT: vmov r0, r1, d6 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: vmov r0, r1, d7 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: vmov.u16 r1, q2[4] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.u16 r1, q2[5] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 ; CHECK-NEXT: vmov.u16 r0, q1[6] ; CHECK-NEXT: vmov.u16 r1, q1[4] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 ; CHECK-NEXT: vmov.u16 r0, q1[7] ; CHECK-NEXT: vmov.u16 r1, q1[5] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmullb.s16 q1, q2, q3 -; CHECK-NEXT: vshl.i32 q1, q1, #10 -; CHECK-NEXT: vshr.s32 q1, q1, #10 -; CHECK-NEXT: vshr.s32 q1, q1, #15 -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov.16 q0[6], r0 
-; CHECK-NEXT: vmov.16 q0[7], r1 +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vmullb.s16 q2, q3, q2 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vshl.i32 q2, q2, #10 +; CHECK-NEXT: vshr.s32 q2, q2, #10 +; CHECK-NEXT: vshr.s32 q2, q2, #15 +; CHECK-NEXT: vstrh.32 q2, [r0, #8] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q1[2] +; CHECK-NEXT: vmov.u16 r2, q1[0] +; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q1[3] +; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 +; CHECK-NEXT: vmullb.s16 q0, q0, q2 +; CHECK-NEXT: vshl.i32 q0, q0, #10 +; CHECK-NEXT: vshr.s32 q0, q0, #10 +; CHECK-NEXT: vshr.s32 q0, q0, #15 +; CHECK-NEXT: vstrh.32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %l2 = sext <8 x i16> %s0 to <8 x i22> diff --git a/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll b/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll --- a/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll +++ b/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll @@ -45,42 +45,32 @@ define void @foo_int8_int32_double(ptr %dest, ptr readonly %src, i32 %n) { ; CHECK-LE-LABEL: foo_int8_int32_double: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vldrh.u16 q1, [r1] -; CHECK-LE-NEXT: vmov r2, r3, d2 -; CHECK-LE-NEXT: vmov.16 q0[0], r2 -; CHECK-LE-NEXT: vmov.16 q0[1], r3 -; CHECK-LE-NEXT: vmov r2, r3, d3 -; CHECK-LE-NEXT: vldrh.u16 q1, [r1, #16] -; CHECK-LE-NEXT: vmov.16 q0[2], r2 -; CHECK-LE-NEXT: vmov.16 q0[3], r3 -; CHECK-LE-NEXT: vmov r1, r2, d2 -; CHECK-LE-NEXT: vmov.16 q0[4], r1 -; CHECK-LE-NEXT: vmov.16 q0[5], r2 -; CHECK-LE-NEXT: vmov r1, r2, d3 -; CHECK-LE-NEXT: vmov.16 q0[6], r1 -; CHECK-LE-NEXT: vmov.16 q0[7], r2 +; CHECK-LE-NEXT: .pad #16 +; CHECK-LE-NEXT: sub sp, #16 +; CHECK-LE-NEXT: vldrh.u16 q0, [r1, #16] +; CHECK-LE-NEXT: mov r2, sp +; 
CHECK-LE-NEXT: vstrh.32 q0, [r2, #8] +; CHECK-LE-NEXT: vldrh.u16 q0, [r1] +; CHECK-LE-NEXT: vstrh.32 q0, [r2] +; CHECK-LE-NEXT: vldrw.u32 q0, [r2] ; CHECK-LE-NEXT: vstrb.16 q0, [r0] +; CHECK-LE-NEXT: add sp, #16 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: foo_int8_int32_double: ; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #16 +; CHECK-BE-NEXT: sub sp, #16 +; CHECK-BE-NEXT: vldrb.u8 q0, [r1, #16] +; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: vrev32.8 q0, q0 +; CHECK-BE-NEXT: vstrh.32 q0, [r2, #8] ; CHECK-BE-NEXT: vldrb.u8 q0, [r1] -; CHECK-BE-NEXT: vrev32.8 q1, q0 -; CHECK-BE-NEXT: vmov r2, r3, d2 -; CHECK-BE-NEXT: vmov.16 q0[0], r2 -; CHECK-BE-NEXT: vmov.16 q0[1], r3 -; CHECK-BE-NEXT: vmov r2, r3, d3 -; CHECK-BE-NEXT: vldrb.u8 q1, [r1, #16] -; CHECK-BE-NEXT: vmov.16 q0[2], r2 -; CHECK-BE-NEXT: vmov.16 q0[3], r3 -; CHECK-BE-NEXT: vrev32.8 q1, q1 -; CHECK-BE-NEXT: vmov r1, r2, d2 -; CHECK-BE-NEXT: vmov.16 q0[4], r1 -; CHECK-BE-NEXT: vmov.16 q0[5], r2 -; CHECK-BE-NEXT: vmov r1, r2, d3 -; CHECK-BE-NEXT: vmov.16 q0[6], r1 -; CHECK-BE-NEXT: vmov.16 q0[7], r2 +; CHECK-BE-NEXT: vrev32.8 q0, q0 +; CHECK-BE-NEXT: vstrh.32 q0, [r2] +; CHECK-BE-NEXT: vldrh.u16 q0, [r2] ; CHECK-BE-NEXT: vstrb.16 q0, [r0] +; CHECK-BE-NEXT: add sp, #16 ; CHECK-BE-NEXT: bx lr entry: %wide.load = load <8 x i32>, ptr %src, align 2