diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -8194,8 +8194,8 @@
         Input = Op->getOperand(1);
         Elt -= 4;
       }
-      SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input);
-      Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast,
+      SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
+      Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
                                 DAG.getConstant(Elt, dl, MVT::i32));
     }
   }
@@ -8214,19 +8214,70 @@
           Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
     SDValue NewShuffle = DAG.getVectorShuffle(
         VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
-    SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle);
+    SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
     for (int Part = 0; Part < 4; ++Part)
       if (!Parts[Part])
-        Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+        Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
                                   BitCast, DAG.getConstant(Part, dl, MVT::i32));
   }
   // Build a vector out of the various parts and bitcast it back to the original
   // type.
-  SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts);
+  SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
   return DAG.getBitcast(VT, NewVec);
 }
 
+static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
+                                              ArrayRef<int> ShuffleMask,
+                                              SelectionDAG &DAG) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // A one-off identity mask is one that is mostly an identity mask from a
+  // single source but contains a single element out-of-place, either from a
+  // different vector or from another position in the same vector. Rather than
+  // lowering this via an ARMISD::BUILD_VECTOR we can generate an
+  // extract/insert pair directly.
+  auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
+                                 int &OffElement) {
+    OffElement = -1;
+    int NonUndef = 0;
+    for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
+      if (Mask[i] == -1)
+        continue;
+      NonUndef++;
+      if (Mask[i] != i + BaseOffset) {
+        if (OffElement == -1)
+          OffElement = i;
+        else
+          return false;
+      }
+    }
+    return NonUndef > 2 && OffElement != -1;
+  };
+
+  int OffElement;
+  SDValue VInput;
+  if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
+    VInput = V1;
+  else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
+    VInput = V2;
+  else
+    return SDValue();
+
+  SDLoc dl(Op);
+  EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
+                ? MVT::i32
+                : VT.getScalarType();
+  SDValue Elt = DAG.getNode(
+      ISD::EXTRACT_VECTOR_ELT, dl, SVT,
+      ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
+      DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
+  return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
+                     DAG.getVectorIdxConstant(OffElement % NumElts, dl));
+}
+
 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
   SDValue V1 = Op.getOperand(0);
@@ -8360,6 +8411,10 @@
     }
   }
 
+  if (ST->hasMVEIntegerOps() && EltSize <= 32)
+    if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
+      return V;
+
   // If the shuffle is not directly supported and it has 4 elements, use
   // the PerfectShuffle-generated table to synthesize it from other shuffles.
   unsigned NumElts = VT.getVectorNumElements();
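(Editor's note — an illustrative walk-through, not part of the patch; the mask below is an assumed example. For a v4i32 shuffle whose mask is the identity from %src1 except for a single lane taken from %src2:

    %out = shufflevector <4 x i32> %src1, <4 x i32> %src2,
                         <4 x i32> <i32 4, i32 1, i32 2, i32 3>

isOneOffIdentityMask(Mask, VT, /*BaseOffset=*/0, OffElement) accepts this mask: lanes 1-3 match the identity, lane 0 (mask value 4) is the single off element, and more than two lanes are defined. The new path then emits an EXTRACT_VECTOR_ELT of %src2 lane 0 (4 % NumElts) followed by an INSERT_VECTOR_ELT into %src1 lane 0, which selects to a single lane move such as "vmov.f32 s0, s4" — compare the oneoff12_i32 test below, which previously needed three vmov.f32 copies plus a whole-register vmov.)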
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -1481,15 +1481,11 @@
 ; CHECK-NEXT: vmovx.f16 s6, s12
 ; CHECK-NEXT: vfma.f16 q3, q6, r4
 ; CHECK-NEXT: vstr.16 s6, [r5, #2]
-; CHECK-NEXT: vmov.f32 s12, s13
 ; CHECK-NEXT: vmovx.f16 s6, s13
-; CHECK-NEXT: vmov q7, q3
+; CHECK-NEXT: vmov.f32 s12, s13
 ; CHECK-NEXT: vins.f16 s12, s6
-; CHECK-NEXT: vmov.16 q7[2], r7
 ; CHECK-NEXT: adds r5, #4
-; CHECK-NEXT: vmov.f32 s13, s29
-; CHECK-NEXT: vmov.f32 s14, s30
-; CHECK-NEXT: vmov.f32 s15, s31
+; CHECK-NEXT: vmov.16 q3[2], r7
 ; CHECK-NEXT: vmov q7, q3
 ; CHECK-NEXT: le lr, .LBB17_5
 ; CHECK-NEXT: .LBB17_6: @ %while.end
diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -72,10 +72,7 @@
 define arm_aapcs_vfpcc <4 x i32> @oneoff12_i32(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: oneoff12_i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s5, s1
-; CHECK-NEXT: vmov.f32 s6, s2
-; CHECK-NEXT: vmov.f32 s7, s3
-; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov.f32 s0, s4
 ; CHECK-NEXT: bx lr
 entry:
   %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32>
@@ -104,6 +101,16 @@
   ret <4 x i32> %out
 }
 
+define arm_aapcs_vfpcc <4 x i32> @oneoffundef_i32(<4 x i32> %src1, <4 x i32> %src2) {
+; CHECK-LABEL: oneoffundef_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s1, s4
+; CHECK-NEXT: bx lr
+entry:
+  %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32>
+  ret <4 x i32> %out
+}
+
 define arm_aapcs_vfpcc <4 x i32> @shuffle2step_i32(<8 x i32> %src) {
 ; CHECK-LABEL: shuffle2step_i32:
 ; CHECK: @ %bb.0: @ %entry
@@ -126,27 +133,22 @@
 define arm_aapcs_vfpcc <4 x i32> @shuffle3step_i32(<16 x i32> %src) {
 ; CHECK-LABEL: shuffle3step_i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: vmov.f32 s12, s1
-; CHECK-NEXT: vmov.f32 s13, s4
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: vdup.32 q4, r0
-; CHECK-NEXT: vmov.f32 s14, s7
-; CHECK-NEXT: vmov.f32 s15, s19
-; CHECK-NEXT: vmov.f32 s16, s0
-; CHECK-NEXT: vmov.f32 s17, s3
-; CHECK-NEXT: vmov r0, s9
-; CHECK-NEXT: vmov.f32 s18, s6
-; CHECK-NEXT: vdup.32 q5, r0
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov.f32 s19, s23
-; CHECK-NEXT: vmov.f32 s10, s8
-; CHECK-NEXT: vadd.i32 q3, q4, q3
-; CHECK-NEXT: vmov.f32 s6, s8
-; CHECK-NEXT: vmov.f32 s7, s11
-; CHECK-NEXT: vadd.i32 q0, q3, q1
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vmov.f32 s14, s8
+; CHECK-NEXT: vmov.f32 s15, s11
+; CHECK-NEXT: vmov.f32 s16, s1
+; CHECK-NEXT: vmov.f32 s12, s2
+; CHECK-NEXT: vmov.f32 s17, s4
+; CHECK-NEXT: vmov.f32 s1, s3
+; CHECK-NEXT: vmov.f32 s18, s7
+; CHECK-NEXT: vmov.f32 s2, s6
+; CHECK-NEXT: vmov.f32 s19, s10
+; CHECK-NEXT: vmov.f32 s3, s9
+; CHECK-NEXT: vmov.f32 s13, s5
+; CHECK-NEXT: vadd.i32 q0, q0, q4
+; CHECK-NEXT: vadd.i32 q0, q0, q3
+; CHECK-NEXT: vpop {d8, d9}
 ; CHECK-NEXT: bx lr
 entry:
   %s1 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32>
@@ -267,10 +269,7 @@
 ; CHECK-LABEL: oneoff11_i16:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.16 q1[2], r0
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.16 q1[3], r0
-; CHECK-NEXT: vmov.f32 s1, s5
+; CHECK-NEXT: vmov.16 q0[2], r0
 ; CHECK-NEXT: bx lr
 entry:
   %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32>
@@ -280,12 +279,8 @@
 define arm_aapcs_vfpcc <8 x i16> @oneoff12_i16(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: oneoff12_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmovnb.i32 q2, q1
-; CHECK-NEXT: vmov.f32 s9, s1
-; CHECK-NEXT: vmov.f32 s10, s2
-; CHECK-NEXT: vmov.f32 s11, s3
-; CHECK-NEXT: vmov q0, q2
+; CHECK-NEXT: vmov.u16 r0, q1[0]
+; CHECK-NEXT: vmov.16 q0[0], r0
 ; CHECK-NEXT: bx lr
 entry:
   %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32>
@@ -306,19 +301,27 @@
 define arm_aapcs_vfpcc <8 x i16> @oneoff22_i16(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: oneoff22_i16:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: vmov.u16 r0, q1[6]
 ; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.16 q0[1], r0
-; CHECK-NEXT: vmov.f32 s1, s5
-; CHECK-NEXT: vmov.f32 s2, s6
-; CHECK-NEXT: vmov.f32 s3, s7
 ; CHECK-NEXT: bx lr
 entry:
   %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32>
   ret <8 x i16> %out
 }
 
+define arm_aapcs_vfpcc <8 x i16> @oneoffundef_i16(<8 x i16> %src1, <8 x i16> %src2) {
+; CHECK-LABEL: oneoffundef_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u16 r0, q0[3]
+; CHECK-NEXT: vmov.16 q1[5], r0
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: bx lr
+entry:
+  %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32>
+  ret <8 x i16> %out
+}
+
 define arm_aapcs_vfpcc <8 x i16> @shuffle2step_i16(<16 x i16> %src) {
 ; CHECK-LABEL: shuffle2step_i16:
 ; CHECK: @ %bb.0: @ %entry
@@ -379,44 +382,36 @@
 ; CHECK-NEXT: vmov.u16 r0, q2[4]
 ; CHECK-NEXT: vmov.16 q5[6], r0
 ; CHECK-NEXT: vmov.u16 r0, q0[2]
-; CHECK-NEXT: vmov.16 q6[0], r0
+; CHECK-NEXT: vmov.f32 s15, s19
+; CHECK-NEXT: vmov.16 q4[0], r0
 ; CHECK-NEXT: vmov.u16 r0, q0[5]
-; CHECK-NEXT: vmov.16 q6[1], r0
+; CHECK-NEXT: vmov.16 q4[1], r0
 ; CHECK-NEXT: vmov.u16 r0, q1[0]
-; CHECK-NEXT: vmov.16 q6[2], r0
+; CHECK-NEXT: vmov.16 q4[2], r0
 ; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.16 q6[3], r0
+; CHECK-NEXT: vmov.16 q4[3], r0
 ; CHECK-NEXT: vmov.u16 r0, q2[7]
 ; CHECK-NEXT: vmov.16 q5[7], r0
-; CHECK-NEXT: vmov.f32 s26, s7
+; CHECK-NEXT: vmov.f32 s18, s7
 ; CHECK-NEXT: vmov.f32 s22, s8
-; CHECK-NEXT: vmov.f32 s15, s19
-; CHECK-NEXT: vmov q4, q5
-; CHECK-NEXT: vmovnb.i32 q4, q6
-; CHECK-NEXT: vmov r1, s24
-; CHECK-NEXT: vmov r0, s18
-; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT: vmov r0, s23
+; CHECK-NEXT: vmov.u16 r0, q1[5]
+; CHECK-NEXT: vmov q6, q5
+; CHECK-NEXT: vmovnb.i32 q6, q4
+; CHECK-NEXT: vmov.f32 s18, s26
+; CHECK-NEXT: vmov.f32 s19, s23
 ; CHECK-NEXT: vins.f16 s22, s8
-; CHECK-NEXT: vmov r1, s25
 ; CHECK-NEXT: vmovx.f16 s23, s9
-; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
 ; CHECK-NEXT: vins.f16 s23, s11
 ; CHECK-NEXT: vmovx.f16 s8, s0
 ; CHECK-NEXT: vins.f16 s8, s2
-; CHECK-NEXT: vmov.u16 r0, q1[5]
 ; CHECK-NEXT: vmovx.f16 s9, s3
 ; CHECK-NEXT: vmov q0, q5
 ; CHECK-NEXT: vins.f16 s9, s5
 ; CHECK-NEXT: vmov.16 q2[4], r0
 ; CHECK-NEXT: vmovnb.i32 q0, q2
-; CHECK-NEXT: vmov r1, s8
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT: vmov r0, s23
-; CHECK-NEXT: vmov r1, s9
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT: vadd.i16 q0, q3, q0
+; CHECK-NEXT: vmov.f32 s10, s2
+; CHECK-NEXT: vmov.f32 s11, s23
+; CHECK-NEXT: vadd.i16 q0, q3, q2
 ; CHECK-NEXT: vadd.i16 q0, q0, q4
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: bx lr
@@ -631,17 +626,8 @@
 define arm_aapcs_vfpcc <16 x i8> @oneoff11_i8(<16 x i8> %src1, <16 x i8> %src2) {
 ; CHECK-LABEL: oneoff11_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u8 r0, q0[0]
-; CHECK-NEXT: vmov.8 q1[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.8 q1[1], r0
-; CHECK-NEXT: vmov.8 q1[2], r0
-; CHECK-NEXT: vmov.u8 r0, q0[3]
-; CHECK-NEXT: vmov.8 q1[3], r0
-; CHECK-NEXT: vmov.f32 s5, s1
-; CHECK-NEXT: vmov.f32 s6, s2
-; CHECK-NEXT: vmov.f32 s7, s3
-; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov.8 q0[2], r0
 ; CHECK-NEXT: bx lr
 entry:
   %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32>
@@ -652,17 +638,7 @@
 ; CHECK-LABEL: oneoff12_i8:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov.u8 r0, q1[4]
-; CHECK-NEXT: vmov.8 q1[0], r0
-; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.8 q1[1], r0
-; CHECK-NEXT: vmov.u8 r0, q0[2]
-; CHECK-NEXT: vmov.8 q1[2], r0
-; CHECK-NEXT: vmov.u8 r0, q0[3]
-; CHECK-NEXT: vmov.8 q1[3], r0
-; CHECK-NEXT: vmov.f32 s5, s1
-; CHECK-NEXT: vmov.f32 s6, s2
-; CHECK-NEXT: vmov.f32 s7, s3
-; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov.8 q0[0], r0
 ; CHECK-NEXT: bx lr
 entry:
   %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32>
@@ -672,18 +648,9 @@
 define arm_aapcs_vfpcc <16 x i8> @oneoff21_i8(<16 x i8> %src1, <16 x i8> %src2) {
 ; CHECK-LABEL: oneoff21_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u8 r0, q1[0]
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmov.8 q0[0], r0
-; CHECK-NEXT: vmov.u8 r0, q1[1]
-; CHECK-NEXT: vmov.8 q0[1], r0
-; CHECK-NEXT: vmov.u8 r0, q1[2]
-; CHECK-NEXT: vmov.8 q0[2], r0
-; CHECK-NEXT: vmov.u8 r0, q2[0]
-; CHECK-NEXT: vmov.8 q0[3], r0
-; CHECK-NEXT: vmov.f32 s1, s5
-; CHECK-NEXT: vmov.f32 s2, s6
-; CHECK-NEXT: vmov.f32 s3, s7
+; CHECK-NEXT: vmov.u8 r0, q0[0]
+; CHECK-NEXT: vmov.8 q1[3], r0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
   %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32>
@@ -694,21 +661,25 @@
 ; CHECK-LABEL: oneoff22_i8:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: vmov.u8 r0, q1[8]
-; CHECK-NEXT: vmov.8 q1[8], r0
-; CHECK-NEXT: vmov.u8 r0, q0[15]
-; CHECK-NEXT: vmov.8 q1[9], r0
-; CHECK-NEXT: vmov.u8 r0, q0[10]
-; CHECK-NEXT: vmov.8 q1[10], r0
-; CHECK-NEXT: vmov.u8 r0, q0[11]
-; CHECK-NEXT: vmov.8 q1[11], r0
-; CHECK-NEXT: vmov.f32 s2, s6
+; CHECK-NEXT: vmov.u8 r0, q1[15]
+; CHECK-NEXT: vmov.8 q0[9], r0
 ; CHECK-NEXT: bx lr
 entry:
   %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32>
   ret <16 x i8> %out
 }
 
+define arm_aapcs_vfpcc <16 x i8> @oneoffundef_i8(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: oneoffundef_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u8 r0, q0[2]
+; CHECK-NEXT: vmov.8 q0[1], r0
+; CHECK-NEXT: bx lr
+entry:
+  %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32>
+  ret <16 x i8> %out
+}
+
 define arm_aapcs_vfpcc <16 x i8> @shuffle2step_i8(<32 x i8> %src) {
 ; CHECK-LABEL: shuffle2step_i8:
 ; CHECK: @ %bb.0: @ %entry
@@ -790,83 +761,78 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vmov.u8 r1, q0[1]
+; CHECK-NEXT: vmov.u8 r0, q0[1]
+; CHECK-NEXT: vmov.8 q3[0], r0
+; CHECK-NEXT: vmov.u8 r0, q0[4]
+; CHECK-NEXT: vmov.8 q3[1], r0
+; CHECK-NEXT: vmov.u8 r0, q0[7]
+; CHECK-NEXT: vmov.8 q3[2], r0
+; CHECK-NEXT: vmov.u8 r0, q0[10]
+; CHECK-NEXT: vmov.8 q3[3], r0
+; CHECK-NEXT: vmov.u8 r0, q0[13]
+; CHECK-NEXT: vmov.8 q3[4], r0
+; CHECK-NEXT: vmov.u8 r0, q1[0]
+; CHECK-NEXT: vmov.8 q3[5], r0
+; CHECK-NEXT: vmov.u8 r0, q1[3]
+; CHECK-NEXT: vmov.8 q3[6], r0
+; CHECK-NEXT: vmov.u8 r0, q1[6]
+; CHECK-NEXT: vmov.8 q3[7], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[9]
-; CHECK-NEXT: vmov.8 q4[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: vmov.8 q4[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: vmov.8 q4[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[10]
-; CHECK-NEXT: vmov.8 q4[3], r1
-; CHECK-NEXT: vmov.u8 r1, q0[13]
 ; CHECK-NEXT: vmov.8 q3[8], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[12]
-; CHECK-NEXT: vmov.8 q4[4], r1
-; CHECK-NEXT: vmov.u8 r1, q1[0]
 ; CHECK-NEXT: vmov.8 q3[9], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[15]
-; CHECK-NEXT: vmov.8 q4[5], r1
-; CHECK-NEXT: vmov.u8 r1, q1[3]
 ; CHECK-NEXT: vmov.8 q3[10], r0
-; CHECK-NEXT: vmov.u8 r0, q2[2]
-; CHECK-NEXT: vmov.8 q4[6], r1
-; CHECK-NEXT: vmov.u8 r1, q1[6]
-; CHECK-NEXT: vmov.8 q3[11], r0
-; CHECK-NEXT: vmov.8 q4[7], r1
-; CHECK-NEXT: vmov r0, s14
-; CHECK-NEXT: vmov r1, s16
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT: vmov.u8 r0, q2[5]
-; CHECK-NEXT: vmov.8 q5[12], r0
+; CHECK-NEXT: vmov.8 q4[12], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[8]
-; CHECK-NEXT: vmov.8 q5[13], r0
+; CHECK-NEXT: vmov.8 q4[13], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[11]
-; CHECK-NEXT: vmov.8 q5[14], r0
+; CHECK-NEXT: vmov.8 q4[14], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[14]
-; CHECK-NEXT: vmov.8 q5[15], r0
-; CHECK-NEXT: vmov r1, s17
-; CHECK-NEXT: vmov r0, s23
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT: vmov.8 q4[15], r0
+; CHECK-NEXT: vmov.u8 r0, q2[2]
+; CHECK-NEXT: vmov q5, q3
+; CHECK-NEXT: vmov.8 q5[11], r0
+; CHECK-NEXT: vmov.u8 r0, q0[0]
+; CHECK-NEXT: vmov.f32 s14, s22
+; CHECK-NEXT: vmov.f32 s15, s19
+; CHECK-NEXT: vmov.8 q4[0], r0
+; CHECK-NEXT: vmov.u8 r0, q0[3]
+; CHECK-NEXT: vmov.8 q4[1], r0
+; CHECK-NEXT: vmov.u8 r0, q0[6]
+; CHECK-NEXT: vmov.8 q4[2], r0
+; CHECK-NEXT: vmov.u8 r0, q0[9]
+; CHECK-NEXT: vmov.8 q4[3], r0
+; CHECK-NEXT: vmov.u8 r0, q0[12]
+; CHECK-NEXT: vmov.8 q4[4], r0
+; CHECK-NEXT: vmov.u8 r0, q0[15]
+; CHECK-NEXT: vmov.8 q4[5], r0
+; CHECK-NEXT: vmov.u8 r0, q1[2]
+; CHECK-NEXT: vmov.8 q4[6], r0
+; CHECK-NEXT: vmov.u8 r0, q1[5]
+; CHECK-NEXT: vmov.8 q4[7], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[8]
 ; CHECK-NEXT: vmov.8 q4[8], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[11]
 ; CHECK-NEXT: vmov.8 q4[9], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[14]
 ; CHECK-NEXT: vmov.8 q4[10], r0
-; CHECK-NEXT: vmov.u8 r0, q2[1]
-; CHECK-NEXT: vmov.8 q4[11], r0
-; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: vmov r0, s18
-; CHECK-NEXT: vmov.8 q4[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: vmov.8 q4[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: vmov.8 q4[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[9]
-; CHECK-NEXT: vmov.8 q4[3], r1
-; CHECK-NEXT: vmov.u8 r1, q0[12]
-; CHECK-NEXT: vmov.8 q4[4], r1
-; CHECK-NEXT: vmov.u8 r1, q0[15]
-; CHECK-NEXT: vmov.8 q4[5], r1
-; CHECK-NEXT: vmov.u8 r1, q1[2]
-; CHECK-NEXT: vmov.8 q4[6], r1
-; CHECK-NEXT: vmov.u8 r1, q1[5]
-; CHECK-NEXT: vmov.8 q4[7], r1
-; CHECK-NEXT: vmov r1, s16
-; CHECK-NEXT: vmov q5[2], q5[0], r1, r0
 ; CHECK-NEXT: vmov.u8 r0, q2[4]
-; CHECK-NEXT: vmov.8 q6[12], r0
+; CHECK-NEXT: vmov.8 q5[12], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[7]
-; CHECK-NEXT: vmov.8 q6[13], r0
+; CHECK-NEXT: vmov.8 q5[13], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[10]
-; CHECK-NEXT: vmov.8 q6[14], r0
+; CHECK-NEXT: vmov.8 q5[14], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[13]
-; CHECK-NEXT: vmov.8 q6[15], r0
-; CHECK-NEXT: vmov r1, s17
-; CHECK-NEXT: vmov r0, s27
-; CHECK-NEXT: vmov q5[3], q5[1], r1, r0
+; CHECK-NEXT: vmov.8 q5[15], r0
+; CHECK-NEXT: vmov.u8 r0, q2[1]
+; CHECK-NEXT: vmov q6, q4
+; CHECK-NEXT: vmov.8 q6[11], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[2]
+; CHECK-NEXT: vmov.f32 s18, s26
+; CHECK-NEXT: vmov.f32 s19, s23
+; CHECK-NEXT: vadd.i8 q3, q4, q3
 ; CHECK-NEXT: vmov.8 q4[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[5]
 ; CHECK-NEXT: vmov.8 q4[1], r0
@@ -880,32 +846,27 @@
 ; CHECK-NEXT: vmov.8 q4[5], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[4]
 ; CHECK-NEXT: vmov.8 q4[6], r0
+; CHECK-NEXT: vmov.u8 r0, q2[6]
+; CHECK-NEXT: vmov.8 q0[12], r0
+; CHECK-NEXT: vmov.u8 r0, q2[9]
+; CHECK-NEXT: vmov.8 q0[13], r0
+; CHECK-NEXT: vmov.u8 r0, q2[12]
+; CHECK-NEXT: vmov.8 q0[14], r0
+; CHECK-NEXT: vmov.u8 r0, q2[15]
+; CHECK-NEXT: vmov.8 q0[15], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[10]
-; CHECK-NEXT: vmov.8 q0[8], r0
+; CHECK-NEXT: vmov.8 q5[8], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[13]
-; CHECK-NEXT: vmov.8 q0[9], r0
+; CHECK-NEXT: vmov.8 q5[9], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[0]
-; CHECK-NEXT: vmov.8 q0[10], r0
+; CHECK-NEXT: vmov.8 q5[10], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[3]
-; CHECK-NEXT: vmov.8 q0[11], r0
+; CHECK-NEXT: vmov.8 q5[11], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[7]
 ; CHECK-NEXT: vmov.8 q4[7], r0
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vmov r1, s16
-; CHECK-NEXT: vadd.i8 q3, q5, q3
-; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT: vmov.u8 r0, q2[6]
-; CHECK-NEXT: vmov.8 q1[12], r0
-; CHECK-NEXT: vmov.u8 r0, q2[9]
-; CHECK-NEXT: vmov.8 q1[13], r0
-; CHECK-NEXT: vmov.u8 r0, q2[12]
-; CHECK-NEXT: vmov.8 q1[14], r0
-; CHECK-NEXT: vmov.u8 r0, q2[15]
-; CHECK-NEXT: vmov.8 q1[15], r0
-; CHECK-NEXT: vmov r1, s17
-; CHECK-NEXT: vmov r0, s7
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT: vadd.i8 q0, q3, q0
+; CHECK-NEXT: vmov.f32 s18, s22
+; CHECK-NEXT: vmov.f32 s19, s3
+; CHECK-NEXT: vadd.i8 q0, q3, q4
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: bx lr
 entry:
@@ -1171,10 +1132,7 @@
 define arm_aapcs_vfpcc <4 x float> @oneoff12_f32(<4 x float> %src1, <4 x float> %src2) {
 ; CHECK-LABEL: oneoff12_f32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s5, s1
-; CHECK-NEXT: vmov.f32 s6, s2
-; CHECK-NEXT: vmov.f32 s7, s3
-; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov.f32 s0, s4
 ; CHECK-NEXT: bx lr
 entry:
   %out = shufflevector <4 x float> %src1, <4 x float> %src2, <4 x i32>
@@ -1227,20 +1185,19 @@
 ; CHECKFP: @ %bb.0: @ %entry
 ; CHECKFP-NEXT: .vsave {d8, d9}
 ; CHECKFP-NEXT: vpush {d8, d9}
-; CHECKFP-NEXT: vmov.f32 s12, s1
-; CHECKFP-NEXT: vmov.f32 s16, s0
-; CHECKFP-NEXT: vmov.f32 s13, s4
-; CHECKFP-NEXT: vmov.f32 s17, s3
-; CHECKFP-NEXT: vmov.f32 s14, s7
-; CHECKFP-NEXT: vmov.f32 s18, s6
-; CHECKFP-NEXT: vmov.f32 s4, s2
-; CHECKFP-NEXT: vmov.f32 s15, s10
-; CHECKFP-NEXT: vmov.f32 s19, s9
-; CHECKFP-NEXT: vmov.f32 s10, s8
-; CHECKFP-NEXT: vadd.f32 q3, q4, q3
-; CHECKFP-NEXT: vmov.f32 s6, s8
-; CHECKFP-NEXT: vmov.f32 s7, s11
-; CHECKFP-NEXT: vadd.f32 q0, q3, q1
+; CHECKFP-NEXT: vmov.f32 s14, s8
+; CHECKFP-NEXT: vmov.f32 s15, s11
+; CHECKFP-NEXT: vmov.f32 s16, s1
+; CHECKFP-NEXT: vmov.f32 s12, s2
+; CHECKFP-NEXT: vmov.f32 s17, s4
+; CHECKFP-NEXT: vmov.f32 s1, s3
+; CHECKFP-NEXT: vmov.f32 s18, s7
+; CHECKFP-NEXT: vmov.f32 s2, s6
+; CHECKFP-NEXT: vmov.f32 s19, s10
+; CHECKFP-NEXT: vmov.f32 s3, s9
+; CHECKFP-NEXT: vmov.f32 s13, s5
+; CHECKFP-NEXT: vadd.f32 q0, q0, q4
+; CHECKFP-NEXT: vadd.f32 q0, q0, q3
 ; CHECKFP-NEXT: vpop {d8, d9}
 ; CHECKFP-NEXT: bx lr
 entry:
@@ -1360,9 +1317,9 @@
 define arm_aapcs_vfpcc <8 x half> @oneoff11_f16(<8 x half> %src1, <8 x half> %src2) {
 ; CHECK-LABEL: oneoff11_f16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s4, s1
-; CHECK-NEXT: vmovx.f16 s1, s0
-; CHECK-NEXT: vins.f16 s1, s4
+; CHECK-NEXT: vmovx.f16 s4, s0
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.16 q0[2], r0
 ; CHECK-NEXT: bx lr
 entry:
   %out = shufflevector <8 x half> %src1, <8 x half> %src2, <8 x i32>
@@ -1372,12 +1329,8 @@
 define arm_aapcs_vfpcc <8 x half> @oneoff12_f16(<8 x half> %src1, <8 x half> %src2) {
 ; CHECK-LABEL: oneoff12_f16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovx.f16 s8, s0
-; CHECK-NEXT: vins.f16 s4, s8
-; CHECK-NEXT: vmov.f32 s5, s1
-; CHECK-NEXT: vmov.f32 s6, s2
-; CHECK-NEXT: vmov.f32 s7, s3
-; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.16 q0[0], r0
 ; CHECK-NEXT: bx lr
 entry:
   %out = shufflevector <8 x half> %src1, <8 x half> %src2, <8 x i32>
@@ -1387,7 +1340,8 @@
 define arm_aapcs_vfpcc <8 x half> @oneoff21_f16(<8 x half> %src1, <8 x half> %src2) {
 ; CHECK-LABEL: oneoff21_f16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vins.f16 s5, s0
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: vmov.16 q1[3], r0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -1399,9 +1353,8 @@
 ; CHECK-LABEL: oneoff22_f16:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: vmovx.f16 s4, s0
-; CHECK-NEXT: vmov.f32 s0, s3
-; CHECK-NEXT: vins.f16 s0, s4
+; CHECK-NEXT: vmov r0, s3
+; CHECK-NEXT: vmov.16 q0[0], r0
 ; CHECK-NEXT: bx lr
 entry:
   %out = shufflevector <8 x half> %src1, <8 x half> %src2, <8 x i32>
@@ -1446,8 +1399,6 @@
 define arm_aapcs_vfpcc <8 x half> @shuffle3step_f16(<32 x half> %src) {
 ; CHECKFP-LABEL: shuffle3step_f16:
 ; CHECKFP: @ %bb.0: @ %entry
-; CHECKFP-NEXT: .save {r4, lr}
-; CHECKFP-NEXT: push {r4, lr}
 ; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13}
 ; CHECKFP-NEXT: vmovx.f16 s16, s2
@@ -1475,30 +1426,19 @@
 ; CHECKFP-NEXT: vins.f16 s24, s2
 ; CHECKFP-NEXT: vmov.f32 s18, s8
 ; CHECKFP-NEXT: vmovx.f16 s25, s3
-; CHECKFP-NEXT: vmovx.f16 s3, s9
-; CHECKFP-NEXT: vins.f16 s3, s11
-; CHECKFP-NEXT: vins.f16 s25, s5
-; CHECKFP-NEXT: vmov r3, s3
 ; CHECKFP-NEXT: vmovx.f16 s0, s10
+; CHECKFP-NEXT: vins.f16 s25, s5
+; CHECKFP-NEXT: vmov.f32 s15, s19
+; CHECKFP-NEXT: vmovx.f16 s27, s9
 ; CHECKFP-NEXT: vins.f16 s9, s0
-; CHECKFP-NEXT: vmovx.f16 s2, s6
-; CHECKFP-NEXT: vins.f16 s2, s8
-; CHECKFP-NEXT: vmov r4, s24
-; CHECKFP-NEXT: vmov r0, s2
-; CHECKFP-NEXT: vmov r12, s14
-; CHECKFP-NEXT: vmov q1[2], q1[0], r4, r0
-; CHECKFP-NEXT: vmov lr, s25
-; CHECKFP-NEXT: vmov r1, s12
-; CHECKFP-NEXT: vmov q1[3], q1[1], lr, r3
+; CHECKFP-NEXT: vins.f16 s27, s11
 ; CHECKFP-NEXT: vmov.f32 s23, s9
-; CHECKFP-NEXT: vmov q0[2], q0[0], r1, r12
-; CHECKFP-NEXT: vmov r1, s13
-; CHECKFP-NEXT: vadd.f16 q1, q5, q1
-; CHECKFP-NEXT: vmov r2, s19
-; CHECKFP-NEXT: vmov q0[3], q0[1], r1, r2
-; CHECKFP-NEXT: vadd.f16 q0, q1, q0
+; CHECKFP-NEXT: vmovx.f16 s26, s6
+; CHECKFP-NEXT: vins.f16 s26, s8
+; CHECKFP-NEXT: vadd.f16 q0, q5, q6
+; CHECKFP-NEXT: vadd.f16 q0, q0, q3
 ; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13}
-; CHECKFP-NEXT: pop {r4, pc}
+; CHECKFP-NEXT: bx lr
 entry:
   %s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32>
   %s2 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32>
@@ -1681,20 +1621,17 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .pad #8
 ; CHECK-NEXT: sub sp, #8
-; CHECK-NEXT: adr r1, .LCPI73_0
+; CHECK-NEXT: adr r2, .LCPI76_0
 ; CHECK-NEXT: vmov.u16 r0, q0[0]
-; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: mov r1, sp
 ; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: mov r2, sp
-; CHECK-NEXT: vmov.f32 s1, s5
-; CHECK-NEXT: vmov.f32 s2, s6
-; CHECK-NEXT: vmov.f32 s3, s7
-; CHECK-NEXT: vstrh.32 q0, [r2]
+; CHECK-NEXT: vstrh.32 q0, [r1]
 ; CHECK-NEXT: ldrd r0, r1, [sp], #8
 ; CHECK-NEXT: bx lr
 ; CHECK-NEXT: .p2align 4
 ; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI73_0:
+; CHECK-NEXT: .LCPI76_0:
 ; CHECK-NEXT: .zero 4
 ; CHECK-NEXT: .long 7 @ 0x7
 ; CHECK-NEXT: .long 1 @ 0x1
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
@@ -34,31 +34,26 @@
 define void @vld3_v4i32(<12 x i32> *%src, <4 x i32> *%dst) {
 ; CHECK-LABEL: vld3_v4i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmov.f32 s13, s0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: vdup.32 q4, r0
-; CHECK-NEXT: vmov.f32 s14, s3
-; CHECK-NEXT: vmov.f32 s15, s19
-; CHECK-NEXT: vmov.f64 d8, d2
-; CHECK-NEXT: vmov.f32 s17, s7
-; CHECK-NEXT: vmov r0, s9
-; CHECK-NEXT: vmov.f32 s18, s2
-; CHECK-NEXT: vdup.32 q5, r0
-; CHECK-NEXT: vmov.f32 s0, s6
-; CHECK-NEXT: vmov.f32 s19, s23
-; CHECK-NEXT: vmov.f32 s10, s8
-; CHECK-NEXT: vadd.i32 q3, q4, q3
-; CHECK-NEXT: vmov.f32 s2, s8
-; CHECK-NEXT: vmov.f32 s3, s11
-; CHECK-NEXT: vadd.i32 q0, q3, q0
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT: vmov.f64 d3, d0
+; CHECK-NEXT: vmov.f32 s7, s3
+; CHECK-NEXT: vmov.f32 s16, s9
+; CHECK-NEXT: vmov.f32 s4, s10
+; CHECK-NEXT: vmov.f32 s17, s12
+; CHECK-NEXT: vmov.f32 s9, s11
+; CHECK-NEXT: vmov.f32 s18, s15
+; CHECK-NEXT: vmov.f32 s10, s14
+; CHECK-NEXT: vmov.f32 s19, s2
+; CHECK-NEXT: vmov.f32 s11, s1
+; CHECK-NEXT: vmov.f32 s5, s13
+; CHECK-NEXT: vadd.i32 q0, q2, q4
+; CHECK-NEXT: vadd.i32 q0, q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: vpop {d8, d9}
 ; CHECK-NEXT: bx lr
 entry:
   %l1 = load <12 x i32>, <12 x i32>* %src, align 4
@@ -74,53 +69,43 @@
 define void @vld3_v8i32(<24 x i32> *%src, <8 x i32> *%dst) {
 ; CHECK-LABEL: vld3_v8i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
-; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmov.f32 s13, s0
-; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: vdup.32 q4, r2
-; CHECK-NEXT: vmov.f32 s14, s3
-; CHECK-NEXT: vmov.f32 s15, s19
-; CHECK-NEXT: vmov.f64 d8, d2
-; CHECK-NEXT: vmov.f32 s17, s7
-; CHECK-NEXT: vmov r2, s9
-; CHECK-NEXT: vmov.f32 s18, s2
-; CHECK-NEXT: vdup.32 q5, r2
-; CHECK-NEXT: vmov.f32 s0, s6
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vmov.f32 s19, s23
-; CHECK-NEXT: vmov.f32 s10, s8
-; CHECK-NEXT: vadd.i32 q3, q4, q3
-; CHECK-NEXT: vmov.f32 s2, s8
-; CHECK-NEXT: vmov.f32 s3, s11
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vadd.i32 q0, q3, q0
-; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
+; CHECK-NEXT: vmov.f64 d3, d0
+; CHECK-NEXT: vmov.f32 s7, s3
 ; CHECK-NEXT: vmov.f32 s16, s9
-; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: vmov.f32 s17, s4
-; CHECK-NEXT: vmov r0, s14
-; CHECK-NEXT: vdup.32 q5, r0
-; CHECK-NEXT: vmov.f32 s18, s7
-; CHECK-NEXT: vmov.f32 s19, s23
-; CHECK-NEXT: vmov.f64 d10, d4
-; CHECK-NEXT: vmov.f32 s21, s11
-; CHECK-NEXT: vmov r0, s13
-; CHECK-NEXT: vmov.f32 s22, s6
-; CHECK-NEXT: vdup.32 q6, r0
 ; CHECK-NEXT: vmov.f32 s4, s10
-; CHECK-NEXT: vmov.f32 s23, s27
-; CHECK-NEXT: vmov.f32 s14, s12
-; CHECK-NEXT: vadd.i32 q4, q5, q4
-; CHECK-NEXT: vmov.f32 s6, s12
-; CHECK-NEXT: vmov.f32 s7, s15
-; CHECK-NEXT: vadd.i32 q1, q4, q1
+; CHECK-NEXT: vmov.f32 s17, s12
+; CHECK-NEXT: vmov.f32 s9, s11
+; CHECK-NEXT: vmov.f32 s18, s15
+; CHECK-NEXT: vmov.f32 s10, s14
+; CHECK-NEXT: vmov.f32 s19, s2
+; CHECK-NEXT: vmov.f32 s11, s1
+; CHECK-NEXT: vmov.f32 s5, s13
+; CHECK-NEXT: vadd.i32 q0, q2, q4
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT: vmov.f64 d5, d2
+; CHECK-NEXT: vstrw.32 q0, [r1, #16]
+; CHECK-NEXT: vmov.f32 s11, s7
+; CHECK-NEXT: vmov.f32 s20, s13
+; CHECK-NEXT: vmov.f32 s8, s14
+; CHECK-NEXT: vmov.f32 s21, s16
+; CHECK-NEXT: vmov.f32 s13, s15
+; CHECK-NEXT: vmov.f32 s22, s19
+; CHECK-NEXT: vmov.f32 s14, s18
+; CHECK-NEXT: vmov.f32 s23, s6
+; CHECK-NEXT: vmov.f32 s15, s5
+; CHECK-NEXT: vmov.f32 s9, s17
+; CHECK-NEXT: vadd.i32 q1, q3, q5
+; CHECK-NEXT: vadd.i32 q1, q1, q2
 ; CHECK-NEXT: vstrw.32 q1, [r1]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
 ; CHECK-NEXT: bx lr
 entry:
   %l1 = load <24 x i32>, <24 x i32>* %src, align 4
@@ -138,99 +123,74 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
-; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
-; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmov.f32 s13, s0
-; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: vdup.32 q4, r2
-; CHECK-NEXT: vmov.f32 s14, s3
-; CHECK-NEXT: vmov.f32 s15, s19
-; CHECK-NEXT: vmov.f64 d8, d2
-; CHECK-NEXT: vmov.f32 s17, s7
-; CHECK-NEXT: vmov r2, s9
-; CHECK-NEXT: vmov.f32 s18, s2
-; CHECK-NEXT: vdup.32 q5, r2
-; CHECK-NEXT: vmov.f32 s0, s6
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vmov.f32 s19, s23
-; CHECK-NEXT: vmov.f32 s10, s8
-; CHECK-NEXT: vadd.i32 q3, q4, q3
-; CHECK-NEXT: vmov.f32 s2, s8
-; CHECK-NEXT: vmov.f32 s3, s11
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vadd.i32 q0, q3, q0
-; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
+; CHECK-NEXT: vmov.f64 d3, d0
+; CHECK-NEXT: vmov.f32 s7, s3
 ; CHECK-NEXT: vmov.f32 s16, s9
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s17, s4
-; CHECK-NEXT: vldrw.u32 q0, [r0, #128]
-; CHECK-NEXT: vmov r2, s14
-; CHECK-NEXT: vdup.32 q5, r2
-; CHECK-NEXT: vmov.f32 s18, s7
-; CHECK-NEXT: vmov.f32 s19, s23
-; CHECK-NEXT: vmov.f64 d10, d4
-; CHECK-NEXT: vmov.f32 s21, s11
-; CHECK-NEXT: vmov r2, s13
-; CHECK-NEXT: vmov.f32 s22, s6
-; CHECK-NEXT: vdup.32 q6, r2
 ; CHECK-NEXT: vmov.f32 s4, s10
-; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
-; CHECK-NEXT: vmov.f32 s23, s27
-; CHECK-NEXT: vmov.f32 s14, s12
-; CHECK-NEXT: vadd.i32 q4, q5, q4
-; CHECK-NEXT: vmov.f32 s6, s12
-; CHECK-NEXT: vmov.f32 s7, s15
-; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
-; CHECK-NEXT: vadd.i32 q1, q4, q1
-; CHECK-NEXT: vldrw.u32 q4, [r0, #176]
+; CHECK-NEXT: vmov.f32 s17, s12
+; CHECK-NEXT: vmov.f32 s9, s11
+; CHECK-NEXT: vmov.f32 s18, s15
+; CHECK-NEXT: vmov.f32 s10, s14
+; CHECK-NEXT: vmov.f32 s19, s2
+; CHECK-NEXT: vmov.f32 s11, s1
+; CHECK-NEXT: vmov.f32 s5, s13
+; CHECK-NEXT: vadd.i32 q0, q2, q4
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT: vmov.f64 d5, d2
+; CHECK-NEXT: vmov.f32 s11, s7
 ; CHECK-NEXT: vmov.f32 s20, s13
-; CHECK-NEXT: vmov.f32 s21, s8
-; CHECK-NEXT: vmov r2, s18
-; CHECK-NEXT: vdup.32 q6, r2
-; CHECK-NEXT: vmov.f32 s22, s11
-; CHECK-NEXT: vmov.f32 s23, s27
-; CHECK-NEXT: vmov.f64 d12, d6
-; CHECK-NEXT: vmov.f32 s25, s15
-; CHECK-NEXT: vmov r2, s17
-; CHECK-NEXT: vmov.f32 s26, s10
-; CHECK-NEXT: vdup.32 q7, r2
 ; CHECK-NEXT: vmov.f32 s8, s14
-; CHECK-NEXT: vmov.f32 s27, s31
-; CHECK-NEXT: vmov.f32 s18, s16
-; CHECK-NEXT: vadd.i32 q5, q6, q5
-; CHECK-NEXT: vmov.f32 s10, s16
-; CHECK-NEXT: vmov.f32 s11, s19
-; CHECK-NEXT: vldrw.u32 q4, [r0, #96]
-; CHECK-NEXT: vadd.i32 q2, q5, q2
-; CHECK-NEXT: vldrw.u32 q5, [r0, #112]
+; CHECK-NEXT: vmov.f32 s21, s16
+; CHECK-NEXT: vmov.f32 s13, s15
+; CHECK-NEXT: vmov.f32 s22, s19
+; CHECK-NEXT: vmov.f32 s14, s18
+; CHECK-NEXT: vmov.f32 s23, s6
+; CHECK-NEXT: vmov.f32 s15, s5
+; CHECK-NEXT: vmov.f32 s9, s17
+; CHECK-NEXT: vadd.i32 q1, q3, q5
+; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vldrw.u32 q2, [r0, #176]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #144]
+; CHECK-NEXT: vldrw.u32 q5, [r0, #160]
+; CHECK-NEXT: vmov.f64 d7, d4
+; CHECK-NEXT: vmov.f32 s15, s11
 ; CHECK-NEXT: vmov.f32 s24, s17
-; CHECK-NEXT: vstrw.32 q2, [r1, #48]
+; CHECK-NEXT: vmov.f32 s12, s18
 ; CHECK-NEXT: vmov.f32 s25, s20
-; CHECK-NEXT: vstrw.32 q1, [r1]
-; CHECK-NEXT: vmov.f64 d6, d8
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vmov.f32 s13, s19
-; CHECK-NEXT: vdup.32 q7, r0
+; CHECK-NEXT: vmov.f32 s17, s19
 ; CHECK-NEXT: vmov.f32 s26, s23
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov.f32 s27, s31
-; CHECK-NEXT: vdup.32 q7, r0
-; CHECK-NEXT: vmov.f32 s14, s22
-; CHECK-NEXT: vmov.f32 s20, s18
-; CHECK-NEXT: vmov.f32 s15, s31
-; CHECK-NEXT: vmov.f32 s2, s0
-; CHECK-NEXT: vadd.i32 q6, q3, q6
-; CHECK-NEXT: vmov.f32 s22, s0
-; CHECK-NEXT: vmov.f32 s23, s3
-; CHECK-NEXT: vadd.i32 q0, q6, q5
-; CHECK-NEXT: vstrw.32 q0, [r1, #32]
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s18, s22
+; CHECK-NEXT: vmov.f32 s27, s10
+; CHECK-NEXT: vmov.f32 s19, s9
+; CHECK-NEXT: vmov.f32 s13, s21
+; CHECK-NEXT: vadd.i32 q2, q4, q6
+; CHECK-NEXT: vadd.i32 q2, q2, q3
+; CHECK-NEXT: vldrw.u32 q3, [r0, #128]
+; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
+; CHECK-NEXT: vldrw.u32 q6, [r0, #112]
+; CHECK-NEXT: vmov.f64 d9, d6
+; CHECK-NEXT: vstrw.32 q2, [r1, #48]
 ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: vstrw.32 q1, [r1]
+; CHECK-NEXT: vmov.f32 s19, s15
+; CHECK-NEXT: vmov.f32 s28, s21
+; CHECK-NEXT: vmov.f32 s16, s22
+; CHECK-NEXT: vmov.f32 s29, s24
+; CHECK-NEXT: vmov.f32 s21, s23
+; CHECK-NEXT: vmov.f32 s30, s27
+; CHECK-NEXT: vmov.f32 s22, s26
+; CHECK-NEXT: vmov.f32 s31, s14
+; CHECK-NEXT: vmov.f32 s23, s13
+; CHECK-NEXT: vmov.f32 s17, s25
+; CHECK-NEXT: vadd.i32 q3, q5, q7
+; CHECK-NEXT: vadd.i32 q3, q3, q4
+; CHECK-NEXT: vstrw.32 q3, [r1, #32]
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
 entry:
@@ -328,71 +288,63 @@
 define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-LABEL: vld3_v8i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vmov.u16 r0, q2[2]
-; CHECK-NEXT: vmov.16 q4[0], r0
-; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vmov.16 q4[1], r0
-; CHECK-NEXT: vmov.u16 r0, q1[0]
-; CHECK-NEXT: vmov.16 q4[2], r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.16 q4[3], r0
-; CHECK-NEXT: vmov.u16 r0, q0[4]
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT: vmov.u16 r0, q3[2]
+; CHECK-NEXT: vmov.16 q0[0], r0
+; CHECK-NEXT: vmov.u16 r0, q3[5]
+; CHECK-NEXT: vmov.16 q0[1], r0
+; CHECK-NEXT: vmov.u16 r0, q2[0]
+; CHECK-NEXT: vmov.16 q0[2], r0
+; CHECK-NEXT: vmov.u16 r0, q2[3]
+; CHECK-NEXT: vmov.16 q0[3], r0
+; CHECK-NEXT: vmov.u16 r0, q1[4]
 ; CHECK-NEXT: vmov.16 q5[6], r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
+; CHECK-NEXT: vmov.u16 r0, q1[7]
 ; CHECK-NEXT: vmov.16 q5[7], r0
-; CHECK-NEXT: vmov.f32 s18, s7
-; CHECK-NEXT: vmov.f32 s22, s0
-; CHECK-NEXT: vmov q3, q5
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vmovnb.i32 q3, q4
-; CHECK-NEXT: vmov r0, s14
-; CHECK-NEXT: vmov q3[2], q3[0], r2, r0
-; CHECK-NEXT: vmov r0, s23
-; CHECK-NEXT: vmov r2, s17
-; CHECK-NEXT: vmov q3[3], q3[1], r2, r0
-; CHECK-NEXT: vmov.u16 r0, q2[0]
+; CHECK-NEXT: vmov.u16 r0, q3[0]
 ; CHECK-NEXT: vmov.16 q4[0], r0
-; CHECK-NEXT: vmov.u16 r0, q2[3]
+; CHECK-NEXT: vmov.u16 r0, q3[3]
+; CHECK-NEXT: vmov.f32 s22, s4
 ; CHECK-NEXT: vmov.16 q4[1], r0
-; CHECK-NEXT: vmov.u16 r0, q2[6]
+; CHECK-NEXT: vmov.u16 r0, q3[6]
+; CHECK-NEXT: vmov.f32 s2, s11
+; CHECK-NEXT: vmov q6, q5
 ; CHECK-NEXT: vmov.16 q4[2], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
+; CHECK-NEXT: vmov.u16 r0, q2[1]
+; CHECK-NEXT: vmovnb.i32 q6, q0
 ; CHECK-NEXT: vmov.16 q4[3], r0
-; CHECK-NEXT: vmov.u16 r0, q1[4]
+; CHECK-NEXT: vmov.u16 r0, q2[4]
+; CHECK-NEXT: vmov.f32 s2, s26
 ; CHECK-NEXT: vmov.16 q4[4], r0
-; CHECK-NEXT: vmov.u16 r0, q0[2]
+; CHECK-NEXT: vmov.u16 r0, q1[2]
+; CHECK-NEXT: vmov.f32 s3, s23
 ; CHECK-NEXT: vmov.16 q5[6], r0
-; CHECK-NEXT: vmov.u16 r0, q0[5]
+; CHECK-NEXT: vmov.u16 r0, q1[5]
 ; CHECK-NEXT: vmov.16 q5[7], r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
+; CHECK-NEXT: vmov.u16 r0, q2[7]
 ; CHECK-NEXT: vmov.16 q4[5], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
+; CHECK-NEXT: vmov.u16 r0, q2[5]
 ; CHECK-NEXT: vmov.f32 s19, s23
-; CHECK-NEXT: vmovx.f16 s20, s8
-; CHECK-NEXT: vins.f16 s20, s10
-; CHECK-NEXT: vmovx.f16 s21, s11
-; CHECK-NEXT: vins.f16 s21, s5
-; CHECK-NEXT: vins.f16 s6, s0
-; CHECK-NEXT: vmovx.f16 s7, s1
+; CHECK-NEXT: vmovx.f16 s20, s12
+; CHECK-NEXT: vins.f16 s20, s14
+; CHECK-NEXT: vmovx.f16 s21, s15
+; CHECK-NEXT: vins.f16 s21, s9
+; CHECK-NEXT: vins.f16 s10, s4
+; CHECK-NEXT: vmovx.f16 s11, s5
 ; CHECK-NEXT: vmov.16 q5[4], r0
-; CHECK-NEXT: vins.f16 s7, s3
-; CHECK-NEXT: vmov r2, s20
-; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: vmovnb.i32 q0, q5
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT: vmov r0, s7
-; CHECK-NEXT: vmov r2, s21
-; CHECK-NEXT: vmov q0[3], q0[1], r2, r0
-; CHECK-NEXT: vadd.i16 q0, q4, q0
-; CHECK-NEXT: vadd.i16 q0, q0, q3
+; CHECK-NEXT: vins.f16 s11, s7
+; CHECK-NEXT: vmov q1, q2
+; CHECK-NEXT: vmovnb.i32 q1, q5
+; CHECK-NEXT: vmov.f32 s22, s6
+; CHECK-NEXT: vmov.f32 s23, s11
+; CHECK-NEXT: vadd.i16 q1, q4, q5
+; CHECK-NEXT: vadd.i16 q0, q1, q0
 ; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: bx lr
 entry:
   %l1 = load <24 x i16>, <24 x i16>* %src, align 4
@@ -410,130 +362,119 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
 ; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
 ; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
 ; CHECK-NEXT: vmov.u16 r2, q2[0]
-; CHECK-NEXT: vmov.16 q1[0], r2
+; CHECK-NEXT: vmov.16 q0[0], r2
 ; CHECK-NEXT: vmov.u16 r2, q2[3]
-; CHECK-NEXT: vmov.16 q1[1], r2
+; CHECK-NEXT: vmov.16 q0[1], r2
 ; CHECK-NEXT: vmov.u16 r2, q2[6]
-; CHECK-NEXT: vmov.16 q1[2], r2
-; CHECK-NEXT: vmov.u16 r2, q0[1]
-; CHECK-NEXT: vmov.16 q1[3], r2
-; CHECK-NEXT: vmov.u16 r2, q0[4]
-; CHECK-NEXT: vmov.16 q1[4], r2
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: vmov.u16 r2, q1[1]
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: vmov.u16 r2, q1[4]
+; CHECK-NEXT: vmov.16 q0[4], r2
 ; CHECK-NEXT: vmov.u16 r2, q3[2]
 ; CHECK-NEXT: vmov.16 q4[6], r2
 ; CHECK-NEXT: vmov.u16 r2, q3[5]
 ; CHECK-NEXT: vmov.16 q4[7], r2
-; CHECK-NEXT: vmov.u16 r2, q0[7]
-; CHECK-NEXT: vmov.16 q1[5], r2
+; CHECK-NEXT: vmov.u16 r2, q1[7]
+; CHECK-NEXT: vmov.16 q0[5], r2
 ; CHECK-NEXT: vmov.u16 r2, q3[4]
 ; CHECK-NEXT: vmov.16 q5[6], r2
 ; CHECK-NEXT: vmov.u16 r2, q2[2]
-; CHECK-NEXT: vmov.16 q6[0], r2
+; CHECK-NEXT: vmov.f32 s3, s19
+; CHECK-NEXT: vmov.16 q4[0], r2
 ; CHECK-NEXT: vmov.u16 r2, q2[5]
-; CHECK-NEXT: vmov.16 q6[1], r2
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: vmov.16 q6[2], r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: vmov.16 q6[3], r2
+; CHECK-NEXT: vmov.16 q4[1], r2
+; CHECK-NEXT: vmov.u16 r2, q1[0]
+; CHECK-NEXT: vmov.16 q4[2], r2
+; CHECK-NEXT: vmov.u16 r2, q1[3]
+; CHECK-NEXT: vmov.16 q4[3], r2
 ; CHECK-NEXT: vmov.u16 r2, q3[7]
 ; CHECK-NEXT: vmov.16 q5[7], r2
-; CHECK-NEXT: vmov.f32 s26, s3
+; CHECK-NEXT: vmov.f32 s18, s7
 ; CHECK-NEXT: vmov.f32 s22, s12
-; CHECK-NEXT: vmov.f32 s7, s19
-; CHECK-NEXT: vmov q4, q5
-; CHECK-NEXT: vmovnb.i32 q4, q6
-; CHECK-NEXT: vmov r3, s24
-; CHECK-NEXT: vmov r2, s18
-; CHECK-NEXT: vmov q4[2], q4[0], r3, r2
-; CHECK-NEXT: vmov r2, s23
+; CHECK-NEXT: vmov.u16 r2, q1[5]
+; CHECK-NEXT: vmov q6, q5
+; CHECK-NEXT: vmovnb.i32 q6, q4
+; CHECK-NEXT: vmov.f32 s18, s26
+; CHECK-NEXT: vmov.f32 s19, s23
 ; CHECK-NEXT: vins.f16 s22, s12
-; CHECK-NEXT: vmov r3, s25
 ; CHECK-NEXT: vmovx.f16 s23, s13
-; CHECK-NEXT: vmov q4[3], q4[1], r3, r2
 ; CHECK-NEXT: vins.f16 s23, s15
 ; CHECK-NEXT: vmovx.f16 s12, s8
 ; CHECK-NEXT: vins.f16 s12, s10
-; CHECK-NEXT: vmov.u16 r2, q0[5]
 ; CHECK-NEXT: vmovx.f16 s13, s11
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vins.f16 s13, s1
-; CHECK-NEXT: vmov q0, q5
+; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT: vins.f16 s13, s5
+; CHECK-NEXT: vmov q1, q5
 ; CHECK-NEXT: vmov.16 q3[4], r2
-; CHECK-NEXT: vmovnb.i32 q0, q3
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT: vmov r2, s23
-; CHECK-NEXT: vmov r3, s13
+; CHECK-NEXT: vmovnb.i32 q1, q3
+; CHECK-NEXT: vmov.f32 s14, s6
+; CHECK-NEXT: vmov.f32 s15, s23
 ; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
-; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
-; CHECK-NEXT: vmov.u16 r2, q2[2]
-; CHECK-NEXT: vadd.i16 q0, q1, q0
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vmov.16 q3[0], r2
-; CHECK-NEXT: vmov.u16 r2, q2[5]
+; CHECK-NEXT: vadd.i16 q0, q0, q3
+; CHECK-NEXT: vldrw.u32 q3, [r0]
 ; CHECK-NEXT: vmov.u16 r0, q5[4]
-; CHECK-NEXT: vmov.16 q3[1], r2
-; CHECK-NEXT: vmov.u16 r2, q1[0]
+; CHECK-NEXT: vadd.i16 q0, q0, q4
+; CHECK-NEXT: vmov.u16 r2, q3[2]
 ; CHECK-NEXT: vmov.16 q6[6], r0
+; CHECK-NEXT: vmov.16 q1[0], r2
+; CHECK-NEXT: vmov.u16 r2, q3[5]
+; CHECK-NEXT: vmov.16 q1[1], r2
+; CHECK-NEXT: vmov.u16 r2, q2[0]
 ; CHECK-NEXT: vmov.u16 r0, q5[7]
-; CHECK-NEXT: vmov.16 q3[2], r2
+; CHECK-NEXT: vmov.16 q1[2], r2
 ; CHECK-NEXT: vmov.16 q6[7], r0
-; CHECK-NEXT: vmov.u16 r2, q1[3]
-; CHECK-NEXT: vmov.16 q3[3], r2
-; CHECK-NEXT: vmov.f32 s26, s20
-; CHECK-NEXT: vadd.i16 q0, q0, q4
-; CHECK-NEXT: vmov.f32 s14, s7
-; CHECK-NEXT: vmov q4, q6
-; CHECK-NEXT: vmov.u16 r2, q2[0]
-; CHECK-NEXT: vmovnb.i32 q4, q3
-; CHECK-NEXT: vmov r3, s27
-; CHECK-NEXT: vmov r0, s18
-; CHECK-NEXT: vmov.16 q4[0], r2
 ; CHECK-NEXT: vmov.u16 r2, q2[3]
-; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: vmov.16 q4[1], r2
-; CHECK-NEXT: vmov.u16 r2, q2[6]
-; CHECK-NEXT: vmov.16 q4[2], r2
-; CHECK-NEXT: vmov.u16 r2, q1[1]
-; CHECK-NEXT: vmov.16 q4[3], r2
-; CHECK-NEXT: vmov.u16 r2, q1[4]
-; CHECK-NEXT: vmov.16 q4[4], r2
-; CHECK-NEXT: vmov.u16 r2, q5[2]
-; CHECK-NEXT: vmov.16 q7[6], r2
-; CHECK-NEXT: vmov.u16 r2, q5[5]
-; CHECK-NEXT: vmov.16 q7[7], r2
-; CHECK-NEXT: vmov.u16 r2, q1[7]
-; CHECK-NEXT: vmov.16 q4[5], r2
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmov q6[2], q6[0], r2, r0
-; CHECK-NEXT: vmov r0, s13
-; CHECK-NEXT: vins.f16 s14, s20
-; CHECK-NEXT: vmov q6[3], q6[1], r0, r3
-; CHECK-NEXT: vmovx.f16 s15, s21
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vins.f16 s15, s23
-; CHECK-NEXT: vmovx.f16 s20, s8
-; CHECK-NEXT: vins.f16 s20, s10
-; CHECK-NEXT: vmov.f32 s19, s31
-; CHECK-NEXT: vmovx.f16 s21, s11
-; CHECK-NEXT: vins.f16 s21, s5
-; CHECK-NEXT: vmov q1, q3
+; CHECK-NEXT: vmov.16 q1[3], r2
+; CHECK-NEXT: vmov.f32 s26, s20
+; CHECK-NEXT: vmov.u16 r0, q3[0]
+; CHECK-NEXT: vmov.f32 s6, s11
+; CHECK-NEXT: vmov q7, q6
+; CHECK-NEXT: vmov.16 q4[0], r0
+; CHECK-NEXT: vmov.u16 r0, q3[3]
+; CHECK-NEXT: vmovnb.i32 q7, q1
+; CHECK-NEXT: vmov.16 q4[1], r0
+; CHECK-NEXT: vmov.u16 r0, q3[6]
+; CHECK-NEXT: vmov.f32 s6, s30
+; CHECK-NEXT: vmov.16 q4[2], r0
+; CHECK-NEXT: vmov.u16 r0, q2[1]
+; CHECK-NEXT: vmov.f32 s7, s27
+; CHECK-NEXT: vins.f16 s26, s20
+; CHECK-NEXT: vmov.16 q4[3], r0
+; CHECK-NEXT: vmov.u16 r0, q2[4]
+; CHECK-NEXT: vmovx.f16 s27, s21
+; CHECK-NEXT: vmov.16 q4[4], r0
+; CHECK-NEXT: vmov.u16 r0, q5[2]
+; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.16 q0[6], r0
+; CHECK-NEXT: vmov.u16 r0, q5[5]
+; CHECK-NEXT: vins.f16 s27, s23
+; CHECK-NEXT: vmovx.f16 s20, s12
+; CHECK-NEXT: vmov.16 q0[7], r0
+; CHECK-NEXT: vins.f16 s20, s14
+; CHECK-NEXT: vmov.u16 r0, q2[7]
+; CHECK-NEXT: vmovx.f16 s21, s15
+; CHECK-NEXT: vmov.16 q4[5], r0
+; CHECK-NEXT: vins.f16 s21, s9
+; CHECK-NEXT: vmov.u16 r0, q2[5]
+; CHECK-NEXT: vmov.f32 s19, s3
 ; CHECK-NEXT: vmov.16 q5[4], r0
-; CHECK-NEXT: vmovnb.i32 q1, q5
-; CHECK-NEXT: vmov r2, s20
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov q1[2], q1[0], r2, r0
-; CHECK-NEXT: vmov r0, s15
-; CHECK-NEXT: vmov r2, s21
-; CHECK-NEXT: vmov q1[3], q1[1], r2, r0
-; CHECK-NEXT: vadd.i16 q1, q4, q1
-; CHECK-NEXT: vadd.i16 q1, q1, q6
-; CHECK-NEXT: vstrw.32 q1, [r1]
+; CHECK-NEXT: vmov q0, q6
+; CHECK-NEXT: vmovnb.i32 q0, q5
+; CHECK-NEXT: vmov.f32 s22, s2
+; CHECK-NEXT: vmov.f32 s23, s27
+; CHECK-NEXT: vadd.i16 q0, q4, q5
+; CHECK-NEXT: vadd.i16 q0, q0, q1
+; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q1, [r1, #16]
+; CHECK-NEXT: add sp, #16
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
 entry:
@@ -697,129 +638,119 @@
 define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) {
 ; CHECK-LABEL: vld3_v16i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT: vmov.u8 r2, q0[0]
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT: vmov.u8 r2, q1[1]
 ; CHECK-NEXT: vmov.8 q3[0], r2
-; CHECK-NEXT: vmov.u8 r2, q0[3]
+; CHECK-NEXT: vmov.u8 r2, q1[4]
 ; CHECK-NEXT: vmov.8 q3[1], r2
-; CHECK-NEXT: vmov.u8 r2, q0[6]
+; CHECK-NEXT: vmov.u8 r2, q1[7]
 ; CHECK-NEXT: vmov.8 q3[2], r2
-; CHECK-NEXT: vmov.u8 r2, q0[9]
+; CHECK-NEXT: vmov.u8 r2, q1[10]
 ; CHECK-NEXT: vmov.8 q3[3], r2
-; CHECK-NEXT: vmov.u8 r2, q0[12]
+; CHECK-NEXT: vmov.u8 r2, q1[13]
 ; CHECK-NEXT: vmov.8 q3[4], r2
-; CHECK-NEXT: vmov.u8 r2, q0[15]
+; CHECK-NEXT: vmov.u8 r2, q0[0]
 ; CHECK-NEXT: vmov.8 q3[5], r2
-; CHECK-NEXT: vmov.u8 r2, q2[2]
+; CHECK-NEXT: vmov.u8 r2, q0[3]
 ; CHECK-NEXT: vmov.8 q3[6], r2
-; CHECK-NEXT: vmov.u8 r2, q2[8]
-; CHECK-NEXT: vmov.8 q4[8], r2
-; CHECK-NEXT: vmov.u8 r2, q2[11]
-; CHECK-NEXT: vmov.8 q4[9], r2
-; CHECK-NEXT: vmov.u8 r2, q2[14]
-; CHECK-NEXT: vmov.8 q4[10], r2
-; CHECK-NEXT: vmov.u8 r0, q1[1]
-; CHECK-NEXT: vmov.8 q4[11], r0
+; CHECK-NEXT: vmov.u8 r2, q0[6]
+; CHECK-NEXT: vmov.8 q3[7], r2
+; CHECK-NEXT: vmov.u8 r2, q0[9]
 ; CHECK-NEXT: vmov.u8 r0, q2[5]
-; CHECK-NEXT: vmov.8 q3[7], r0
-; CHECK-NEXT: vmov r0, s18
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmov q4[2], q4[0], r2, r0
-; CHECK-NEXT: vmov.u8 r0, q2[9]
-; CHECK-NEXT: vmov.8 q5[8], r0
-; CHECK-NEXT: vmov.u8 r0, q2[12]
-; CHECK-NEXT: vmov.8 q5[9], r0
-; CHECK-NEXT: vmov.u8 r0, q2[15]
-; CHECK-NEXT: vmov.8 q5[10], r0
-; CHECK-NEXT: vmov.u8 r0, q1[2]
+; CHECK-NEXT: vmov.8 q3[8], r2
+; CHECK-NEXT: vmov.u8 r2, q0[12]
+; CHECK-NEXT: vmov.8 q4[12], r0
+; CHECK-NEXT: vmov.u8 r0, q2[8]
+; CHECK-NEXT: vmov.8 q3[9], r2
+; CHECK-NEXT: vmov.u8 r2, q0[15]
+; CHECK-NEXT: vmov.8 q4[13], r0
+; CHECK-NEXT: vmov.u8 r0, q2[11]
+; CHECK-NEXT: vmov.8 q3[10], r2
+; CHECK-NEXT: vmov.8 q4[14], r0
+; CHECK-NEXT: vmov.u8 r0, q2[14]
+; CHECK-NEXT: vmov.8 q4[15], r0
+; CHECK-NEXT: vmov.u8 r0, q2[2]
+; CHECK-NEXT: vmov q5, q3
 ; CHECK-NEXT: vmov.8 q5[11], r0
-; CHECK-NEXT: vmov.u8 r2, q0[1]
-; CHECK-NEXT: vmov r0, s22
-; CHECK-NEXT: vmov.8 q5[0], r2
-; CHECK-NEXT: vmov.u8 r2, q0[4]
-; CHECK-NEXT: vmov.8 q5[1], r2
-; CHECK-NEXT: vmov.u8 r2, q0[7]
-; CHECK-NEXT: vmov.8 q5[2], r2
-; CHECK-NEXT: vmov.u8 r2, q0[10]
-; CHECK-NEXT: vmov.8 q5[3], r2
-; CHECK-NEXT: vmov.u8 r2, q0[13]
-; CHECK-NEXT: vmov.8 q5[4], r2
-; CHECK-NEXT: vmov.u8 r2, q2[0]
-; CHECK-NEXT: vmov.8 q5[5], r2
-; CHECK-NEXT: vmov.u8 r2, q2[3]
-; CHECK-NEXT: vmov.8 q5[6], r2
-; CHECK-NEXT: vmov.u8 r2, q2[6]
-; CHECK-NEXT: vmov.8 q5[7], r2
-; CHECK-NEXT: vmov r2, s20
-; CHECK-NEXT: vmov q6[2], q6[0], r2, r0
-; CHECK-NEXT: vmov.u8 r0, q1[5]
-; CHECK-NEXT: vmov.8 q7[12], r0
-; CHECK-NEXT: vmov.u8 r0, q1[8]
-; CHECK-NEXT: vmov.8 q7[13], r0
-; CHECK-NEXT: vmov.u8 r0, q1[11]
-; CHECK-NEXT: vmov.8 q7[14], r0
-; CHECK-NEXT: vmov.u8 r0, q1[14]
-; CHECK-NEXT: vmov.8 q7[15], r0
-; CHECK-NEXT: vmov r2, s21
-; CHECK-NEXT: vmov r0, s31
-; CHECK-NEXT: vmov q6[3], q6[1], r2, r0
-; CHECK-NEXT: vmov.u8 r0, q1[4]
-; CHECK-NEXT: vmov.8 q5[12], r0
-; CHECK-NEXT: vmov.u8 r0, q1[7]
-; CHECK-NEXT: vmov.8 q5[13], r0
-; CHECK-NEXT: vmov.u8 r0, q1[10]
-; CHECK-NEXT: vmov.8 q5[14], r0
-; CHECK-NEXT: vmov.u8 r0, q1[13]
-; CHECK-NEXT: vmov.8 q5[15], r0
-; CHECK-NEXT: vmov r2, s13
-; CHECK-NEXT: vmov r0, s23
-; CHECK-NEXT: vmov q4[3], q4[1], r2, r0
-; CHECK-NEXT: vmov.u8 r0, q2[10]
-; CHECK-NEXT: vadd.i8 q3, q4, q6
-; CHECK-NEXT: vmov.8 q4[8], r0
-; CHECK-NEXT: vmov.u8 r0, q2[13]
-; CHECK-NEXT: vmov.u8 r2, q0[2]
-; CHECK-NEXT: vmov.8 q4[9], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[0]
-; CHECK-NEXT: vmov.8 q4[10], r0
+; CHECK-NEXT: vmov.f32 s14, s22
+; CHECK-NEXT: vmov.f32 s15, s19
+; CHECK-NEXT: vmov.8 q4[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[3]
-; CHECK-NEXT: vmov.8 q4[11], r0
-; CHECK-NEXT: vmov r0, s18
-; CHECK-NEXT: vmov.8 q4[0], r2
-; CHECK-NEXT: vmov.u8 r2, q0[5]
-; CHECK-NEXT: vmov.8 q4[1], r2
-; CHECK-NEXT: vmov.u8 r2, q0[8]
-; CHECK-NEXT: vmov.8 q4[2], r2
-; CHECK-NEXT: vmov.u8 r2, q0[11]
-; CHECK-NEXT: vmov.8 q4[3], r2
-; CHECK-NEXT: vmov.u8 r2, q0[14]
-; CHECK-NEXT: vmov.8 q4[4], r2
-; CHECK-NEXT: vmov.u8 r2, q2[1]
-; CHECK-NEXT: vmov.8 q4[5], r2
-; CHECK-NEXT: vmov.u8 r2, q2[4]
-; CHECK-NEXT: vmov.8 q4[6], r2
-; CHECK-NEXT: vmov.u8 r2, q2[7]
-; CHECK-NEXT: vmov.8 q4[7], r2
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT: vmov.8 q4[1], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[6]
-; CHECK-NEXT: vmov.8 q2[12], r0
+; CHECK-NEXT: vmov.8 q4[2], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[9]
-; CHECK-NEXT: vmov.8 q2[13], r0
+; CHECK-NEXT: vmov.8 q4[3], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[12]
-; CHECK-NEXT: vmov.8 q2[14], r0
+; CHECK-NEXT: vmov.8 q4[4], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[15]
-; CHECK-NEXT: vmov.8 q2[15], r0
-; CHECK-NEXT: vmov r2, s17
-; CHECK-NEXT: vmov r0, s11
-; CHECK-NEXT: vmov q0[3], q0[1], r2, r0
-; CHECK-NEXT: vadd.i8 q0, q3, q0
+; CHECK-NEXT: vmov.8 q4[5], r0
+; CHECK-NEXT: vmov.u8 r0, q0[2]
+; CHECK-NEXT: vmov.8 q4[6], r0
+; CHECK-NEXT: vmov.u8 r0, q0[5]
+; CHECK-NEXT: vmov.8 q4[7], r0
+; CHECK-NEXT: vmov.u8 r0, q0[8]
+; CHECK-NEXT: vmov.8 q4[8], r0
+; CHECK-NEXT: vmov.u8 r0, q0[11]
+; CHECK-NEXT: vmov.8 q4[9], r0
+; CHECK-NEXT: vmov.u8 r0, q0[14]
+; CHECK-NEXT: vmov.8 q4[10], r0
+; CHECK-NEXT: vmov.u8 r0, q2[4]
+; CHECK-NEXT: vmov.8 q5[12], r0
+; CHECK-NEXT: vmov.u8 r0, q2[7]
+; CHECK-NEXT: vmov.8 q5[13], r0
+; CHECK-NEXT: vmov.u8 r0, q2[10]
+; CHECK-NEXT: vmov.8 q5[14], r0
+; CHECK-NEXT: vmov.u8 r0, q2[13]
+; CHECK-NEXT: vmov.8 q5[15], r0
+; CHECK-NEXT: vmov.u8 r0, q2[1]
+; CHECK-NEXT: vmov q6, q4
+; CHECK-NEXT: vmov.8 q6[11], r0
+; CHECK-NEXT: vmov.u8 r0, q1[2]
+; CHECK-NEXT: vmov.f32 s18, s26
+; CHECK-NEXT: vmov.f32 s19, s23
+; CHECK-NEXT: vadd.i8 q3, q4, q3
+; CHECK-NEXT: vmov.8 q4[0], r0
+; CHECK-NEXT: vmov.u8 r0, q1[5]
+; CHECK-NEXT: vmov.8 q4[1], r0
+; CHECK-NEXT: vmov.u8 r0, q1[8]
+; CHECK-NEXT: vmov.8 q4[2], r0
+; CHECK-NEXT: vmov.u8 r0, q1[11]
+; CHECK-NEXT: vmov.8 q4[3], r0
+; CHECK-NEXT: vmov.u8 r0, q1[14]
+; CHECK-NEXT: vmov.8 q4[4], r0
+; CHECK-NEXT: vmov.u8 r0, q0[1]
+; CHECK-NEXT: vmov.8 q4[5], r0
+; CHECK-NEXT: vmov.u8 r0, q0[4]
+; CHECK-NEXT: vmov.8 q4[6], r0
+; CHECK-NEXT: vmov.u8 r0, q2[6]
+; CHECK-NEXT: vmov.8 q1[12], r0
+; CHECK-NEXT: vmov.u8 r0, q2[9]
+; CHECK-NEXT: vmov.8 q1[13], r0
+; CHECK-NEXT: vmov.u8 r0, q2[12]
+; CHECK-NEXT: vmov.8 q1[14], r0
+; CHECK-NEXT: vmov.u8 r0, q2[15]
+; CHECK-NEXT: vmov.8 q1[15], r0
+; CHECK-NEXT: vmov.u8 r0, q0[10]
+; CHECK-NEXT: vmov.8 q5[8], r0
+; CHECK-NEXT: vmov.u8 r0, q0[13]
+; CHECK-NEXT: vmov.8 q5[9], r0
+; CHECK-NEXT: vmov.u8 r0, q2[0]
+; CHECK-NEXT: vmov.8 q5[10], r0
+; CHECK-NEXT: vmov.u8 r0, q2[3]
+; CHECK-NEXT: vmov.8 q5[11], r0
+; CHECK-NEXT: vmov.u8 r0, q0[7]
+; CHECK-NEXT: vmov.8 q4[7], r0
+; CHECK-NEXT: vmov.f32 s18, s22
+; CHECK-NEXT: vmov.f32 s19, s7
+; CHECK-NEXT: vadd.i8 q0, q3, q4
 ; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: bx lr
 entry:
   %l1 = load <48 x i8>, <48 x i8>* %src, align 4
@@ -1019,23 +950,22 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .vsave {d8, d9}
 ; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmov.f64 d8, d2
-; CHECK-NEXT: vmov.f32 s13, s0
-; CHECK-NEXT: vmov.f32 s17, s7
-; CHECK-NEXT: vmov.f32 s14, s3
-; CHECK-NEXT: vmov.f32 s18, s2
-; CHECK-NEXT: vmov.f32 s0, s6
-; CHECK-NEXT: vmov.f32 s15, s10
-; CHECK-NEXT: vmov.f32 s19, s9
-; CHECK-NEXT: vmov.f32 s10, s8
-; CHECK-NEXT: vadd.f32 q3, q4, q3
-; CHECK-NEXT: vmov.f32 s2, s8
-; CHECK-NEXT: vmov.f32 s3, s11
-; CHECK-NEXT: vadd.f32 q0, q3, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT: vmov.f64 d3, d0
+; CHECK-NEXT: vmov.f32 s7, s3
+; CHECK-NEXT: vmov.f32 s16, s9
+; CHECK-NEXT: vmov.f32 s4, s10
+; CHECK-NEXT: vmov.f32 s17, s12
+; CHECK-NEXT: vmov.f32 s9, s11
+; CHECK-NEXT: vmov.f32 s18, s15
+; CHECK-NEXT: vmov.f32 s10, s14
+; CHECK-NEXT: vmov.f32 s19, s2
+; CHECK-NEXT: vmov.f32 s11, s1
+; CHECK-NEXT: vmov.f32 s5, s13
+; CHECK-NEXT: vadd.f32 q0, q2, q4
+; CHECK-NEXT: vadd.f32 q0, q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r1]
 ; CHECK-NEXT: vpop {d8, d9}
 ; CHECK-NEXT: bx lr
@@ -1055,41 +985,39 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
-; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmov.f64 d8, d2
-; CHECK-NEXT: vmov.f32 s13, s0
-; CHECK-NEXT: vmov.f32 s17, s7
-; CHECK-NEXT: vmov.f32 s14, s3
-; CHECK-NEXT: vmov.f32 s18, s2
-; CHECK-NEXT: vmov.f32 s0, s6
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vmov.f32 s15, s10
-; CHECK-NEXT: vmov.f32 s19, s9
-; CHECK-NEXT: vmov.f32 s10, s8
-; CHECK-NEXT: vadd.f32 q3, q4, q3
-; CHECK-NEXT: vmov.f32 s2, s8
-; CHECK-NEXT: vmov.f32 s3, s11
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vadd.f32 q0, q3, q0
-; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
+; CHECK-NEXT: vmov.f64 d3, d0
+; CHECK-NEXT: vmov.f32 s7, s3
 ; CHECK-NEXT: vmov.f32 s16, s9
-; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: vmov.f64 d10, d4
-; CHECK-NEXT: vmov.f32 s17, s4
-; CHECK-NEXT: vmov.f32 s21, s11
-; CHECK-NEXT: vmov.f32 s18, s7
-; CHECK-NEXT: vmov.f32 s22, s6
 ; CHECK-NEXT: vmov.f32 s4, s10
-; CHECK-NEXT: vmov.f32 s19, s14
-; CHECK-NEXT: vmov.f32 s23, s13
-; CHECK-NEXT: vmov.f32 s14, s12
-; CHECK-NEXT: vadd.f32 q4, q5, q4
-; CHECK-NEXT: vmov.f32 s6, s12
-; CHECK-NEXT: vmov.f32 s7, s15
-; CHECK-NEXT: vadd.f32 q1, q4, q1
+; CHECK-NEXT: vmov.f32 s17, s12
+; CHECK-NEXT: vmov.f32 s9, s11
+; CHECK-NEXT: vmov.f32 s18, s15
+; CHECK-NEXT: vmov.f32 s10, s14
+; CHECK-NEXT: vmov.f32 s19, s2
+; CHECK-NEXT: vmov.f32 s11, s1
+; CHECK-NEXT: vmov.f32 s5, s13
+; CHECK-NEXT: vadd.f32 q0, q2, q4
+; CHECK-NEXT: vadd.f32 q0, q0, q1
+; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT: vmov.f64 d5, d2
+; CHECK-NEXT: vstrw.32 q0, [r1, #16]
+; CHECK-NEXT: vmov.f32 s11, s7
+; CHECK-NEXT: vmov.f32 s20, s13
+; CHECK-NEXT: vmov.f32 s8, s14
+; CHECK-NEXT: vmov.f32 s21, s16
+; CHECK-NEXT: vmov.f32 s13, s15
+; CHECK-NEXT: vmov.f32 s22, s19
+; CHECK-NEXT: vmov.f32 s14, s18
+; CHECK-NEXT: vmov.f32 s23, s6
+; CHECK-NEXT: vmov.f32 s15, s5
+; CHECK-NEXT: vmov.f32 s9, s17
+; CHECK-NEXT: vadd.f32 q1, q3, q5
+; CHECK-NEXT: vadd.f32 q1, q1, q2
 ; CHECK-NEXT: vstrw.32 q1, [r1]
 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
 ; CHECK-NEXT: bx lr
@@ -1109,77 +1037,73 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
-; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmov.f64 d8, d2
-; CHECK-NEXT: vmov.f32 s13, s0
-; CHECK-NEXT: vmov.f32 s17, s7
-; CHECK-NEXT: vmov.f32 s14, s3
-; CHECK-NEXT: vmov.f32 s18, s2
-; CHECK-NEXT: vmov.f32 s0, s6
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vmov.f32 s15, s10
-; CHECK-NEXT: vmov.f32 s19, s9
-; CHECK-NEXT: vmov.f32 s10, s8
-; CHECK-NEXT: vadd.f32 q3, q4, q3
-; CHECK-NEXT: vmov.f32 s2, s8
-; CHECK-NEXT: vmov.f32 s3, s11
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vadd.f32 q0, q3, q0
-; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
+; CHECK-NEXT: vmov.f64 d3, d0
+; CHECK-NEXT: vmov.f32 s7, s3
 ; CHECK-NEXT: vmov.f32 s16, s9
-; CHECK-NEXT: vmov.f64 d10, d4
-; CHECK-NEXT: vmov.f32 s17, s4
-; CHECK-NEXT: vmov.f32 s21, s11
-; CHECK-NEXT: vmov.f32 s18, s7
-; CHECK-NEXT: vmov.f32 s22, s6
 ; CHECK-NEXT: vmov.f32 s4, s10
-; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
-; CHECK-NEXT: vmov.f32 s19, s14
-; CHECK-NEXT: vmov.f32 s23, s13
-; CHECK-NEXT: vmov.f32 s14, s12
-; CHECK-NEXT: vadd.f32 q4, q5, q4
-; CHECK-NEXT: vmov.f32 s6, s12
-; CHECK-NEXT: vmov.f32 s7, s15
-; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
-; CHECK-NEXT: vadd.f32 q1, q4, q1
-; CHECK-NEXT: vldrw.u32 q4, [r0, #176]
+; CHECK-NEXT: vmov.f32 s17, s12
+; CHECK-NEXT: vmov.f32 s9, s11
+; CHECK-NEXT: vmov.f32 s18, s15
+; CHECK-NEXT: vmov.f32 s10, s14
+; CHECK-NEXT: vmov.f32 s19, s2
+; CHECK-NEXT: vmov.f32 s11, s1
+; CHECK-NEXT: vmov.f32 s5, s13
+; CHECK-NEXT: vadd.f32 q0, q2, q4
+; CHECK-NEXT: vadd.f32 q0, q0, q1
+; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT: vmov.f64 d5, d2
+; CHECK-NEXT: vmov.f32 s11, s7
 ; CHECK-NEXT: vmov.f32 s20, s13
-; CHECK-NEXT: vmov.f64 d12, d6
-; CHECK-NEXT: vmov.f32 s21, s8
-; CHECK-NEXT: vmov.f32 s25, s15
-; CHECK-NEXT: vmov.f32 s22, s11
-; CHECK-NEXT: vmov.f32 s26, s10
 ; CHECK-NEXT: vmov.f32 s8, s14
-; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
-; CHECK-NEXT: vmov.f32 s23, s18
-; CHECK-NEXT: vmov.f32 s27, s17
-; CHECK-NEXT: vmov.f32 s18, s16
-; CHECK-NEXT: vadd.f32 q5, q6, q5
-; CHECK-NEXT: vmov.f32 s10, s16
-; CHECK-NEXT: vmov.f32 s11, s19
-; CHECK-NEXT: vldrw.u32 q4, [r0, #96]
-; CHECK-NEXT: vadd.f32 q2, q5, q2
-; CHECK-NEXT: vldrw.u32 q5, [r0, #128]
+; CHECK-NEXT: vmov.f32 s21, s16
+; CHECK-NEXT: vmov.f32 s13, s15
+; CHECK-NEXT: vmov.f32 s22, s19
+; CHECK-NEXT: vmov.f32 s14, s18
+; CHECK-NEXT: vmov.f32 s23, s6
+; CHECK-NEXT: vmov.f32 s15, s5
+; CHECK-NEXT: vmov.f32 s9, s17
+; CHECK-NEXT: vadd.f32 q1, q3, q5
+; CHECK-NEXT: vadd.f32 q1, q1, q2
+; CHECK-NEXT: vldrw.u32 q2, [r0, #176]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #144]
+; CHECK-NEXT: vldrw.u32 q5, [r0, #160]
+; CHECK-NEXT: vmov.f64 d7, d4
+; CHECK-NEXT: vmov.f32 s15, s11
 ; CHECK-NEXT: vmov.f32 s24, s17
+; CHECK-NEXT: vmov.f32 s12, s18
+; CHECK-NEXT: vmov.f32 s25, s20
+; CHECK-NEXT: vmov.f32 s17, s19
+; CHECK-NEXT: vmov.f32 s26, s23
+; CHECK-NEXT: vmov.f32 s18, s22
+; CHECK-NEXT: vmov.f32 s27, s10
+; CHECK-NEXT: vmov.f32 s19, s9
+; CHECK-NEXT: vmov.f32 s13, s21
+; CHECK-NEXT: vadd.f32 q2, q4, q6
+; CHECK-NEXT: vadd.f32 q2, q2, q3
+; CHECK-NEXT: vldrw.u32 q3, [r0, #128]
+; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
+; CHECK-NEXT: vldrw.u32 q6, [r0, #112]
+; CHECK-NEXT: vmov.f64 d9, d6
 ; CHECK-NEXT: vstrw.32 q2, [r1, #48]
-; CHECK-NEXT: vmov.f64 d14, d8
 ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
 ; CHECK-NEXT: vstrw.32 q1, [r1]
-; CHECK-NEXT: vmov.f32 s25, s12
-; CHECK-NEXT: vmov.f32 s29, s19
-; CHECK-NEXT: vmov.f32 s26, s15
-; CHECK-NEXT: vmov.f32 s30, s14
-; CHECK-NEXT: vmov.f32 s12, s18
-; CHECK-NEXT: vmov.f32 s27, s22
-; CHECK-NEXT: vmov.f32 s31, s21
-; CHECK-NEXT: vmov.f32 s22, s20
-; CHECK-NEXT: vadd.f32 q6, q7, q6
-; CHECK-NEXT: vmov.f32 s14, s20
-; CHECK-NEXT: vmov.f32 s15, s23
-; CHECK-NEXT: vadd.f32 q3, q6, q3
+; CHECK-NEXT: vmov.f32 s19, s15
+; CHECK-NEXT: vmov.f32 s28, s21
+; CHECK-NEXT: vmov.f32 s16, s22
+; CHECK-NEXT: vmov.f32 s29, s24
+; CHECK-NEXT: vmov.f32 s21, s23
+; CHECK-NEXT: vmov.f32 s30, s27
+; CHECK-NEXT: vmov.f32 s22, s26
+; CHECK-NEXT: vmov.f32 s31, s14
+; CHECK-NEXT: vmov.f32 s23, s13
+; CHECK-NEXT: vmov.f32 s17, s25
+; CHECK-NEXT: vadd.f32 q3, q5, q7
+; CHECK-NEXT: vadd.f32 q3, q3, q4
 ; CHECK-NEXT: vstrw.32 q3, [r1, #32]
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
@@ -1274,8 +1198,6 @@
 define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vld3_v8f16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: vldrw.u32 q3, [r0]
@@ -1305,32 +1227,21 @@
 ; CHECK-NEXT: vins.f16 s22, s24
 ; CHECK-NEXT: vmovx.f16 s24, s12
 ; CHECK-NEXT: vins.f16 s24, s14
-; CHECK-NEXT: vmov r12, s2
+; CHECK-NEXT: vmov.f32 s3, s11
 ; CHECK-NEXT: vmovx.f16 s25, s15
-; CHECK-NEXT: vmovx.f16 s15, s17
-; CHECK-NEXT: vins.f16 s15, s19
-; CHECK-NEXT: vins.f16 s25, s5
-; CHECK-NEXT: vmov r0, s15
 ; CHECK-NEXT: vmovx.f16 s12, s18
-; CHECK-NEXT: vmovx.f16 s6, s6
+; CHECK-NEXT: vins.f16 s25, s5
+; CHECK-NEXT: vmovx.f16 s27, s17
 ; CHECK-NEXT: vins.f16 s17, s12
-; CHECK-NEXT: vins.f16 s6, s16
-; CHECK-NEXT: vmov r5, s24
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov r4, s0
-; CHECK-NEXT: vmov q1[2], q1[0], r4, r12
-; CHECK-NEXT: vmov r4, s1
-; CHECK-NEXT: vmov lr, s25
-; CHECK-NEXT: vmov q0[2], q0[0], r5, r2
-; CHECK-NEXT: vmov r3, s11
-; CHECK-NEXT: vmov q0[3], q0[1], lr, r0
+; CHECK-NEXT: vins.f16 s27, s19
 ; CHECK-NEXT: vmov.f32 s23, s17
-; CHECK-NEXT: vmov q1[3], q1[1], r4, r3
-; CHECK-NEXT: vadd.f16 q0, q5, q0
-; CHECK-NEXT: vadd.f16 q0, q0, q1
+; CHECK-NEXT: vmovx.f16 s26, s6
+; CHECK-NEXT: vins.f16 s26, s16
+; CHECK-NEXT: vadd.f16 q1, q5, q6
vadd.f16 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: bx lr entry: %l1 = load <24 x half>, <24 x half>* %src, align 4 %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> @@ -1345,8 +1256,6 @@ define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vld3_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vldrw.u32 q3, [r0, #48] @@ -1376,46 +1285,36 @@ ; CHECK-NEXT: vins.f16 s22, s24 ; CHECK-NEXT: vmovx.f16 s24, s12 ; CHECK-NEXT: vins.f16 s24, s14 -; CHECK-NEXT: vmov r12, s2 +; CHECK-NEXT: vmov.f32 s3, s11 ; CHECK-NEXT: vmovx.f16 s25, s15 -; CHECK-NEXT: vmovx.f16 s15, s17 +; CHECK-NEXT: vmovx.f16 s12, s18 ; CHECK-NEXT: vins.f16 s25, s5 -; CHECK-NEXT: vmov r6, s24 -; CHECK-NEXT: vmov lr, s25 -; CHECK-NEXT: vmovx.f16 s24, s18 -; CHECK-NEXT: vmovx.f16 s6, s6 -; CHECK-NEXT: vins.f16 s17, s24 -; CHECK-NEXT: vins.f16 s6, s16 -; CHECK-NEXT: vins.f16 s15, s19 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov q1[2], q1[0], r5, r12 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r4, s15 -; CHECK-NEXT: vmov q0[2], q0[0], r6, r3 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov q0[3], q0[1], lr, r4 +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmovx.f16 s27, s17 +; CHECK-NEXT: vins.f16 s17, s12 +; CHECK-NEXT: vins.f16 s27, s19 ; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vmovx.f16 s26, s6 +; CHECK-NEXT: vins.f16 s26, s16 ; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vmov q1[3], q1[1], r5, r2 -; CHECK-NEXT: vadd.f16 q0, q5, q0 -; CHECK-NEXT: vadd.f16 q1, q0, q1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vadd.f16 q1, q5, q6 +; CHECK-NEXT: vadd.f16 q0, q1, q0 +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmovx.f16 s12, s18 -; CHECK-NEXT: vmov.f32 s4, s17 -; CHECK-NEXT: vins.f16 s4, s12 +; CHECK-NEXT: vmov.f32 s0, s17 +; CHECK-NEXT: vmovx.f16 s20, s7 +; CHECK-NEXT: vins.f16 s0, s12 ; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vmov.f32 s5, s8 -; CHECK-NEXT: vmovx.f16 s20, s3 -; CHECK-NEXT: vins.f16 s5, s12 -; CHECK-NEXT: vmov.f32 s15, s2 -; CHECK-NEXT: vins.f16 s15, s20 -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vmovx.f16 s20, s0 +; CHECK-NEXT: vmov.f32 s1, s8 ; CHECK-NEXT: vmovx.f16 s24, s17 -; CHECK-NEXT: vins.f16 s6, s20 +; CHECK-NEXT: vins.f16 s1, s12 +; CHECK-NEXT: vmov.f32 s15, s6 +; CHECK-NEXT: vins.f16 s15, s20 +; CHECK-NEXT: vmov.f32 s2, s11 +; CHECK-NEXT: vmovx.f16 s20, s4 +; CHECK-NEXT: vmov.f32 s14, s4 +; CHECK-NEXT: vins.f16 s2, s20 ; CHECK-NEXT: vmov.f64 d10, d8 ; CHECK-NEXT: vins.f16 s20, s24 ; CHECK-NEXT: vmovx.f16 s24, s8 @@ -1426,33 +1325,21 @@ ; CHECK-NEXT: vins.f16 s22, s24 ; CHECK-NEXT: vmovx.f16 s24, s16 ; CHECK-NEXT: vins.f16 s24, s18 -; CHECK-NEXT: vmov.f32 s14, s0 +; CHECK-NEXT: vmov.f32 s3, s15 ; CHECK-NEXT: vmovx.f16 s25, s19 -; CHECK-NEXT: vmovx.f16 s19, s1 +; CHECK-NEXT: vmovx.f16 s16, s6 ; CHECK-NEXT: vins.f16 s25, s9 -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: vmovx.f16 s24, s2 -; CHECK-NEXT: vmovx.f16 s10, s10 -; CHECK-NEXT: vins.f16 s1, s24 -; CHECK-NEXT: vins.f16 s10, s0 -; CHECK-NEXT: vins.f16 s19, s3 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r6, s10 -; CHECK-NEXT: vmov r4, 
s4 -; CHECK-NEXT: vmov.f32 s23, s1 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r6 -; CHECK-NEXT: vmov r5, s19 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r5 -; CHECK-NEXT: vmov r4, s15 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r4 -; CHECK-NEXT: vadd.f16 q1, q5, q1 +; CHECK-NEXT: vmovx.f16 s27, s5 +; CHECK-NEXT: vins.f16 s5, s16 +; CHECK-NEXT: vins.f16 s27, s7 +; CHECK-NEXT: vmov.f32 s23, s5 +; CHECK-NEXT: vmovx.f16 s26, s10 +; CHECK-NEXT: vins.f16 s26, s4 +; CHECK-NEXT: vadd.f16 q1, q5, q6 ; CHECK-NEXT: vadd.f16 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: bx lr entry: %l1 = load <48 x half>, <48 x half>* %src, align 4 %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -42,30 +42,26 @@ define void @vst3_v4i32(<4 x i32> *%src, <12 x i32> *%dst) { ; CHECK-LABEL: vst3_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vmov.f32 s4, s9 -; CHECK-NEXT: vmov.f64 d6, d8 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.f32 s5, s1 -; CHECK-NEXT: vdup.32 q5, r0 -; CHECK-NEXT: vmov.f32 s13, s8 +; CHECK-NEXT: vmov.f64 d8, d6 +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s8, s5 +; CHECK-NEXT: vmov.f32 s19, s13 +; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vmov.f32 s18, s0 ; CHECK-NEXT: vmov.f32 s0, s2 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.f32 s7, s10 -; CHECK-NEXT: vdup.32 q2, r0 -; CHECK-NEXT: vmov.f32 s15, s17 -; CHECK-NEXT: vmov.f32 s1, s19 -; CHECK-NEXT: vmov.f32 s6, s18 -; CHECK-NEXT: vmov.f32 s14, s22 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: vmov.f32 s11, s6 +; CHECK-NEXT: vmov.f32 s1, s15 +; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vstrw.32 q2, [r1, #16] ; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0 @@ -86,57 +82,45 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #48 -; CHECK-NEXT: sub sp, #48 -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vmov.f64 d6, d1 -; CHECK-NEXT: vldrw.u32 q6, [r0, #32] -; CHECK-NEXT: vldrw.u32 q7, [r0, #64] -; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s13, s23 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov.f32 s15, s3 -; CHECK-NEXT: vdup.32 q2, r2 -; CHECK-NEXT: vmov.f32 s14, s10 -; CHECK-NEXT: vmov.f64 d4, d8 -; CHECK-NEXT: vstrw.32 q3, [r1, #80] -; CHECK-NEXT: vmov.f32 s9, s24 -; CHECK-NEXT: vmov.f32 
s11, s17 -; CHECK-NEXT: vmov q4, q5 -; CHECK-NEXT: vmov.f32 s21, s4 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmov.f32 s23, s17 -; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.f32 s0, s5 -; CHECK-NEXT: vdup.32 q6, r0 -; CHECK-NEXT: vmov.f32 s10, s18 -; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s22, s26 -; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vstrw.32 q5, [r1, #48] -; CHECK-NEXT: vmov.f32 s4, s17 +; CHECK-NEXT: vldrw.u32 q7, [r0, #32] +; CHECK-NEXT: vldrw.u32 q6, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vmov.f64 d10, d8 +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r0, #64] +; CHECK-NEXT: vmov.f32 s21, s28 +; CHECK-NEXT: vmov.f64 d14, d12 +; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s29, s12 +; CHECK-NEXT: vmov.f32 s9, s27 +; CHECK-NEXT: vmov.f32 s31, s25 +; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov.f32 s30, s0 +; CHECK-NEXT: vmov.f32 s0, s13 +; CHECK-NEXT: vstrw.32 q7, [r1, #48] +; CHECK-NEXT: vmov.f32 s3, s14 +; CHECK-NEXT: vmov.f32 s2, s26 +; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s10, s15 ; CHECK-NEXT: vstrw.32 q0, [r1, #64] -; CHECK-NEXT: vmov.f32 s5, s29 -; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: vmov.f32 s28, s30 -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.f32 s7, s18 -; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov.f32 s29, s27 -; CHECK-NEXT: vmov.f32 s6, s26 -; CHECK-NEXT: vmov.f32 s30, s18 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vstrw.32 q7, [r1, #32] -; CHECK-NEXT: add sp, #48 +; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vstrw.32 q2, [r1, #80] +; CHECK-NEXT: vmov.f32 s12, s25 +; CHECK-NEXT: vmov.f32 s13, s5 +; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vstrw.32 q5, [r1] +; CHECK-NEXT: vmov.f32 s15, s26 +; CHECK-NEXT: vmov.f32 s5, s19 +; CHECK-NEXT: vmov.f32 s14, s18 +; CHECK-NEXT: vmov.f32 s6, s27 +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vstrw.32 q1, [r1, #32] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -160,120 +144,104 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #160 ; CHECK-NEXT: sub sp, #160 -; CHECK-NEXT: vldrw.u32 q1, [r0, #144] +; CHECK-NEXT: vldrw.u32 q7, [r0, #96] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q3, [r0, #128] -; CHECK-NEXT: vldrw.u32 q5, [r0] -; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vldrw.u32 q4, [r0, #176] -; CHECK-NEXT: vstrw.32 q1, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s9, s13 -; CHECK-NEXT: vldrw.u32 q6, [r0, #112] -; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [r0, #160] -; CHECK-NEXT: vmov.f32 s11, s2 +; CHECK-NEXT: vldrw.u32 q2, [r0, #128] +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r0, #80] +; CHECK-NEXT: vmov.f32 s16, s1 +; CHECK-NEXT: vldrw.u32 q3, [r0, #160] +; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q7, [r0, #48] -; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [r0, #96] -; 
CHECK-NEXT: vmov.f32 s10, s22 -; CHECK-NEXT: vstrw.32 q1, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vstrw.32 q2, [r1, #16] -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d2, d7 -; CHECK-NEXT: vdup.32 q2, r0 -; CHECK-NEXT: vmov.f32 s5, s23 -; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vmov.f32 s17, s9 +; CHECK-NEXT: vstrw.32 q3, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s19, s2 +; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r0, #32] +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vldrw.u32 q5, [r0, #144] +; CHECK-NEXT: vldrw.u32 q1, [r0, #176] +; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: vmov.f64 d8, d5 +; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s17, s27 +; CHECK-NEXT: vmov.f32 s19, s11 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vstrw.32 q4, [r1, #32] +; CHECK-NEXT: vmov.f64 d8, d3 +; CHECK-NEXT: vmov.f32 s17, s31 +; CHECK-NEXT: vmov.f32 s19, s7 +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d8, d12 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s19, s25 +; CHECK-NEXT: vmov.f32 s18, s8 +; CHECK-NEXT: vmov q2, q7 +; CHECK-NEXT: vmov.f64 d0, d4 +; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s1, s12 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s4, s13 +; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s7, s14 +; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vstrw.32 q1, [r1, #32] -; CHECK-NEXT: vmov.f64 d2, d9 -; CHECK-NEXT: vmov.f32 s5, s31 -; CHECK-NEXT: vmov.f32 s7, s19 -; CHECK-NEXT: vmov r0, s27 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vmov.f64 d2, d10 -; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s5, s0 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s7, s21 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f64 d0, d14 -; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s1, s24 -; CHECK-NEXT: vmov.f32 s3, s29 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s16, s25 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s19, s26 -; CHECK-NEXT: vldrw.u32 q6, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s30 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmov.f32 s28, s5 -; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s29, s1 -; CHECK-NEXT: vmov.f32 s31, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d8, d1 +; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s17, s1 +; CHECK-NEXT: vmov.f32 s19, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f64 d12, d11 +; 
CHECK-NEXT: vmov q7, q1 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s18, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov q0, q7 +; CHECK-NEXT: vmov.f32 s25, s7 +; CHECK-NEXT: vstrw.32 q4, [r1, #112] +; CHECK-NEXT: vmov.f32 s27, s23 +; CHECK-NEXT: vldrw.u32 q5, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s10, s15 +; CHECK-NEXT: vldrw.u32 q3, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s29, s20 ; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s17, s7 -; CHECK-NEXT: vmov.f32 s30, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d6, d5 -; CHECK-NEXT: vstrw.32 q7, [r1, #112] -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s19, s3 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov r0, s27 -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s15, s11 -; CHECK-NEXT: vstrw.32 q4, [r1, #128] -; CHECK-NEXT: vmov.f32 s14, s2 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vmov.f32 s21, s4 -; CHECK-NEXT: vstrw.32 q3, [r1, #80] -; CHECK-NEXT: vmov.f32 s23, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.f32 s22, s2 +; CHECK-NEXT: vmov.f32 s31, s1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q5, [r1, #96] -; CHECK-NEXT: vmov.f64 d2, d0 -; CHECK-NEXT: vmov.f32 s5, s24 -; CHECK-NEXT: vmov q6, q0 -; CHECK-NEXT: vmov.f32 s7, s1 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q1, [r1, #48] -; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s10, s26 +; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vstrw.32 q2, [r1, #128] +; CHECK-NEXT: vmov.f32 s30, s0 +; CHECK-NEXT: vstrw.32 q6, [r1, #80] +; CHECK-NEXT: vmov.f64 d0, d2 +; CHECK-NEXT: vstrw.32 q7, [r1, #96] +; CHECK-NEXT: vmov.f32 s1, s12 +; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s2, s12 +; CHECK-NEXT: vstrw.32 q0, [r1, #48] +; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vstrw.32 q0, [r1, #144] -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q2, [r1, #64] +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s15, s6 ; CHECK-NEXT: vstrw.32 q0, [r1, #160] -; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s14, s22 ; CHECK-NEXT: vstrw.32 q0, [r1, #176] -; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [r1, #64] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: add sp, #160 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -311,9 +279,8 @@ ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 ; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vmov.f32 s3, s2 -; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.32 q0[2], r12 ; CHECK-NEXT: vstrh.32 q0, [r1] ; CHECK-NEXT: str r0, [r1, #8] ; CHECK-NEXT: 
pop {r4, pc} @@ -336,32 +303,30 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrh.u32 q2, [r0, #16] ; CHECK-NEXT: vldrh.u32 q1, [r0] ; CHECK-NEXT: vldrh.u32 q3, [r0, #8] -; CHECK-NEXT: vmov.f64 d0, d5 -; CHECK-NEXT: vmov.f32 s1, s7 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vldrh.u32 q2, [r0, #16] ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.f32 s2, s18 ; CHECK-NEXT: vmov.16 q4[0], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vstrh.32 q0, [r1, #16] ; CHECK-NEXT: vmov.16 q4[1], r0 ; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmov.16 q4[2], r0 ; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmov.16 q4[3], r0 ; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov.f64 d0, d5 ; CHECK-NEXT: vmov.16 q4[4], r0 ; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.f32 s1, s7 ; CHECK-NEXT: vmov.16 q4[5], r0 ; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.f32 s3, s11 ; CHECK-NEXT: vmov.16 q4[6], r0 ; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.f32 s2, s15 ; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vstrh.32 q0, [r1, #16] ; CHECK-NEXT: vstrw.32 q4, [r1] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr @@ -382,77 +347,77 @@ define void @vst3_v8i16(<8 x i16> *%src, <24 x i16> *%dst) { ; CHECK-LABEL: vst3_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f64 d8, d2 -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.f64 d0, d4 +; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmov.f32 s17, s12 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.f32 s18, s12 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vins.f16 s3, s5 +; CHECK-NEXT: vmov.16 q5[2], r0 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov.u16 r0, q4[5] +; CHECK-NEXT: vmov.16 q5[5], r0 ; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vins.f16 s16, s8 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vins.f16 s5, s9 -; CHECK-NEXT: vmov.f32 s19, s5 -; CHECK-NEXT: vdup.32 q5, r2 -; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.16 q4[0], r0 +; CHECK-NEXT: vmov.u16 r0, q3[5] +; CHECK-NEXT: vmov.16 q4[1], r0 +; CHECK-NEXT: vmov.f32 s1, s21 +; CHECK-NEXT: vins.f16 s17, s7 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.f32 s2, s22 +; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.u16 r0, q3[7] +; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vmov.f32 s22, s11 +; CHECK-NEXT: vmov.f32 s18, s15 ; CHECK-NEXT: vmov.u16 r2, q5[2] -; CHECK-NEXT: vmov.16 q6[2], r2 ; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov.16 q6[2], r2 ; CHECK-NEXT: vmov.16 q6[3], r0 ; CHECK-NEXT: vmov.u16 r0, q4[4] ; CHECK-NEXT: vmov.16 q6[4], r0 ; CHECK-NEXT: vmov.u16 r0, q5[5] ; 
CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.u16 r0, q3[2] ; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] +; CHECK-NEXT: vmov.u16 r0, q2[3] ; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vins.f16 s21, s11 -; CHECK-NEXT: vmov.f32 s17, s25 +; CHECK-NEXT: vmov.u16 r0, q3[4] ; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov.u16 r0, q2[5] ; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vdup.32 q6, r2 -; CHECK-NEXT: vmov.f32 s22, s15 -; CHECK-NEXT: vmov.u16 r2, q6[2] +; CHECK-NEXT: vrev32.16 q1, q1 +; CHECK-NEXT: vmov.f32 s21, s13 +; CHECK-NEXT: vmov.u16 r2, q1[2] +; CHECK-NEXT: vmov.f32 s22, s10 +; CHECK-NEXT: vmov.16 q2[2], r2 ; CHECK-NEXT: vmov.u16 r0, q5[3] -; CHECK-NEXT: vmov.16 q7[2], r2 -; CHECK-NEXT: vmov.f32 s1, s13 -; CHECK-NEXT: vmov.16 q7[3], r0 +; CHECK-NEXT: vmov.f32 s17, s25 +; CHECK-NEXT: vmov.16 q2[3], r0 ; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vrev32.16 q2, q2 -; CHECK-NEXT: vmov.16 q7[4], r0 -; CHECK-NEXT: vmov.u16 r0, q6[5] -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.16 q7[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov.f32 s21, s29 +; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] ; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.f32 s22, s30 -; CHECK-NEXT: vmov.f32 s1, s9 -; CHECK-NEXT: vstrw.32 q5, [r1, #32] -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vmov.f32 s21, s9 +; CHECK-NEXT: vstrw.32 q4, [r1, #32] +; CHECK-NEXT: vmov.f32 s22, s10 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vstrw.32 q5, [r1, #16] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0 @@ -473,175 +438,164 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #160 -; CHECK-NEXT: sub sp, #160 +; CHECK-NEXT: .pad #112 +; CHECK-NEXT: sub sp, #112 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vmov.f64 d4, d2 +; CHECK-NEXT: vldrw.u32 q7, [r0, #64] +; CHECK-NEXT: vmov.f64 d12, d2 ; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov q6, q0 -; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q6, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vins.f16 s8, s0 -; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov.f32 s11, s5 -; CHECK-NEXT: vins.f16 s11, s1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vdup.32 q0, r3 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[2], r3 -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: 
vstrw.32 q7, [sp] @ 16-byte Spill +; CHECK-NEXT: vins.f16 s24, s0 +; CHECK-NEXT: vmov.16 q6[4], r2 +; CHECK-NEXT: vmov.f32 s27, s5 +; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: vins.f16 s27, s1 +; CHECK-NEXT: vmov.f32 s13, s4 +; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s14, s4 +; CHECK-NEXT: vmov.f32 s25, s8 +; CHECK-NEXT: vmov.u16 r3, q3[2] +; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.u16 r2, q6[3] +; CHECK-NEXT: vmov.16 q3[2], r3 +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov.u16 r2, q6[4] +; CHECK-NEXT: vmov.16 q3[4], r2 ; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vmov.16 q2[5], r2 -; CHECK-NEXT: vmov.u16 r2, q3[5] ; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov.u16 r2, q1[5] ; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vstrw.32 q2, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vins.f16 s21, s15 -; CHECK-NEXT: vmov.u16 r2, q3[7] -; CHECK-NEXT: vstrw.32 q0, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q5[6], r2 ; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vins.f16 s21, s3 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vmov.16 q5[6], r2 +; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: vmov.f32 s10, s3 ; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.f32 s22, s3 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.u16 r3, q2[2] +; CHECK-NEXT: vmov.f32 s22, s7 +; CHECK-NEXT: vmov.16 q1[2], r3 ; CHECK-NEXT: vmov.u16 r2, q5[3] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov.u16 r2, q5[4] +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov.u16 r2, q2[5] +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: vmov.f32 s21, s5 +; CHECK-NEXT: vmov.f64 d8, d4 +; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vins.f16 s16, s0 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.16 q4[4], r2 +; CHECK-NEXT: vstrw.32 q5, [r1, #80] +; CHECK-NEXT: vmov.f32 s19, s9 +; CHECK-NEXT: vins.f16 s19, s1 +; CHECK-NEXT: vmov.f32 s17, s8 +; CHECK-NEXT: vmov.f32 s9, s28 +; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov.f32 s10, s28 +; CHECK-NEXT: vmov.u16 r2, q2[2] +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmov.u16 r0, q4[4] +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.f32 s17, s5 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.f32 s18, s6 +; CHECK-NEXT: vmov.16 q3[5], r0 ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.f64 d14, d0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vins.f16 s28, s12 -; CHECK-NEXT: vmov.16 q7[4], r0 -; CHECK-NEXT: vmov.f32 s31, s1 -; CHECK-NEXT: vins.f16 s31, s13 -; CHECK-NEXT: vrev32.16 q3, q3 -; CHECK-NEXT: vmov.f32 s29, s0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte 
Reload -; CHECK-NEXT: vmov.u16 r0, q7[3] -; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.16 q2[0], r0 +; CHECK-NEXT: vmov.u16 r0, q7[5] +; CHECK-NEXT: vmov.16 q2[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vins.f16 s9, s3 +; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov.u16 r0, q7[7] +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: vmov.f32 s10, s31 +; CHECK-NEXT: vmov.u16 r2, q1[2] +; CHECK-NEXT: vmov.u16 r0, q2[3] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmov.f32 s25, s13 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q7, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s26, s14 +; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vrev32.16 q1, q1 +; CHECK-NEXT: vmov.u16 r0, q7[2] +; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q1[0], r0 +; CHECK-NEXT: vmov.u16 r0, q3[3] +; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vmov.16 q1[1], r0 ; CHECK-NEXT: vmov.u16 r0, q7[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q6[5] +; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.u16 r0, q3[5] +; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s5, s29 +; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s6, s14 +; CHECK-NEXT: vstrw.32 q2, [r1, #32] +; CHECK-NEXT: vmov.u16 r2, q7[2] +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vstrw.32 q6, [r1, #48] +; CHECK-NEXT: vmov.16 q3[3], r0 +; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov.u16 r0, q7[5] +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vldrw.u32 q7, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s5, s13 +; CHECK-NEXT: vmov.f32 s6, s14 +; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q1, [r1, #64] +; CHECK-NEXT: vmov.u16 r0, q3[2] ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.u16 r0, q7[3] ; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q6[7] -; CHECK-NEXT: vins.f16 s1, s27 -; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov.u16 r0, q3[4] ; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.u16 r0, q7[5] ; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.f32 s2, s19 -; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov.f32 s1, s13 +; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s2, s30 +; CHECK-NEXT: vrev32.16 q3, q3 ; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vdup.32 q6, r2 -; CHECK-NEXT: vmov.u16 r2, q6[2] -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.u16 r2, q3[2] +; CHECK-NEXT: vmov.16 q7[2], r2 +; CHECK-NEXT: vmov.16 q7[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q6[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: 
vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vrev32.16 q6, q6 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vstrw.32 q6, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.f32 s1, s9 -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.f32 s29, s5 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.f32 s30, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [r1, #80] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s21, s5 -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q5, [r1, #32] -; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s25, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s26, s18 -; CHECK-NEXT: vstrw.32 q7, [r1] -; CHECK-NEXT: vmov.u16 r2, q6[3] -; CHECK-NEXT: vmov.f32 s13, s5 -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmov.u16 r2, q6[4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.f32 s25, s1 -; CHECK-NEXT: vmov.f32 s26, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s14, s6 -; CHECK-NEXT: vstrw.32 q6, [r1, #16] -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.u16 r2, q3[3] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.f32 s13, s1 -; CHECK-NEXT: vmov.f32 s14, s2 -; CHECK-NEXT: vstrw.32 q3, [r1, #64] -; CHECK-NEXT: add sp, #160 +; CHECK-NEXT: vmov.16 q7[4], r0 +; CHECK-NEXT: vmov.u16 r0, q3[5] +; CHECK-NEXT: vmov.16 q7[5], r0 +; CHECK-NEXT: vmov.f32 s1, s29 +; CHECK-NEXT: vmov.f32 s2, s30 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: add sp, #112 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -774,11 +728,11 @@ ; CHECK-NEXT: vmov.16 q0[1], r2 ; CHECK-NEXT: vmov.u16 r2, q1[7] ; CHECK-NEXT: vins.f16 s1, s7 -; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov.f32 s17, s15 ; CHECK-NEXT: vmov.16 q0[6], r2 ; CHECK-NEXT: vmov.u16 r2, q2[7] ; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vdup.32 q4, r0 +; CHECK-NEXT: vmov.f32 s18, s15 ; CHECK-NEXT: vmov.f32 s2, s11 ; CHECK-NEXT: vmov.u16 r0, q4[2] ; CHECK-NEXT: vmov.16 q5[2], r0 @@ -1180,22 +1134,22 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q0, [r0, 
#16] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vmov.f64 d8, d6 -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s8, s5 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s8, s1 ; CHECK-NEXT: vmov.f32 s19, s13 -; CHECK-NEXT: vmov.f32 s9, s1 -; CHECK-NEXT: vmov.f32 s18, s0 -; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmov.f32 s18, s4 +; CHECK-NEXT: vmov.f32 s4, s6 ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov.f32 s11, s6 -; CHECK-NEXT: vmov.f32 s1, s15 +; CHECK-NEXT: vmov.f32 s11, s2 +; CHECK-NEXT: vmov.f32 s5, s15 ; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vmov.f32 s6, s3 ; CHECK-NEXT: vstrw.32 q2, [r1, #16] -; CHECK-NEXT: vstrw.32 q0, [r1, #32] +; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: @@ -1220,40 +1174,40 @@ ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] +; CHECK-NEXT: vldrw.u32 q6, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] ; CHECK-NEXT: vmov.f64 d10, d8 -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r0, #48] +; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vmov.f32 s21, s28 -; CHECK-NEXT: vmov.f64 d14, d12 -; CHECK-NEXT: vmov.f64 d4, d1 -; CHECK-NEXT: vmov.f32 s29, s12 -; CHECK-NEXT: vmov.f32 s9, s27 -; CHECK-NEXT: vmov.f32 s31, s25 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s30, s0 -; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vstrw.32 q7, [r1, #48] -; CHECK-NEXT: vmov.f32 s3, s14 -; CHECK-NEXT: vmov.f32 s2, s26 -; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s10, s15 -; CHECK-NEXT: vstrw.32 q0, [r1, #64] +; CHECK-NEXT: vmov.f32 s21, s24 +; CHECK-NEXT: vmov.f64 d12, d4 +; CHECK-NEXT: vmov.f64 d6, d1 +; CHECK-NEXT: vmov.f32 s25, s28 +; CHECK-NEXT: vmov.f32 s13, s11 +; CHECK-NEXT: vmov.f32 s27, s9 +; CHECK-NEXT: vmov.f32 s15, s3 +; CHECK-NEXT: vmov.f32 s26, s0 +; CHECK-NEXT: vmov.f32 s0, s29 +; CHECK-NEXT: vstrw.32 q6, [r1, #48] +; CHECK-NEXT: vmov.f32 s3, s30 +; CHECK-NEXT: vmov.f32 s14, s31 +; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s23, s17 -; CHECK-NEXT: vstrw.32 q2, [r1, #80] -; CHECK-NEXT: vmov.f32 s12, s25 -; CHECK-NEXT: vmov.f32 s13, s5 +; CHECK-NEXT: vstrw.32 q3, [r1, #80] +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vmov.f32 s8, s29 +; CHECK-NEXT: vstrw.32 q0, [r1, #64] +; CHECK-NEXT: vmov.f32 s9, s5 ; CHECK-NEXT: vmov.f32 s22, s4 ; CHECK-NEXT: vmov.f32 s4, s6 ; CHECK-NEXT: vstrw.32 q5, [r1] -; CHECK-NEXT: vmov.f32 s15, s26 +; CHECK-NEXT: vmov.f32 s11, s30 ; CHECK-NEXT: vmov.f32 s5, s19 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s6, s27 -; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vmov.f32 s10, s18 +; CHECK-NEXT: vmov.f32 s6, s31 +; CHECK-NEXT: vstrw.32 q2, [r1, #16] ; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -1279,104 +1233,103 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #160 ; CHECK-NEXT: sub sp, #160 -; CHECK-NEXT: vldrw.u32 q5, [r0, #96] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r0, #128] +; CHECK-NEXT: vldrw.u32 q5, [r0, #112] +; CHECK-NEXT: vldrw.u32 q2, [r0, #128] ; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: 
vmov.f32 s12, s1 +; CHECK-NEXT: vstrw.32 q5, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q5, [r0, #96] +; CHECK-NEXT: vmov.f32 s13, s9 +; CHECK-NEXT: vmov.f32 s15, s2 +; CHECK-NEXT: vldrw.u32 q4, [r0, #160] ; CHECK-NEXT: vstrw.32 q5, [sp, #112] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q5, [r0, #80] -; CHECK-NEXT: vmov.f32 s16, s1 -; CHECK-NEXT: vldrw.u32 q3, [r0, #160] -; CHECK-NEXT: vstrw.32 q5, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r0, #32] +; CHECK-NEXT: vmov.f32 s14, s26 +; CHECK-NEXT: vstrw.32 q4, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q4, [r0, #144] +; CHECK-NEXT: vstrw.32 q5, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0, #176] ; CHECK-NEXT: vldrw.u32 q5, [r0, #48] -; CHECK-NEXT: vmov.f32 s17, s5 -; CHECK-NEXT: vstrw.32 q3, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s19, s2 -; CHECK-NEXT: vstrw.32 q5, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vldrw.u32 q7, [r0, #144] -; CHECK-NEXT: vldrw.u32 q2, [r0, #176] -; CHECK-NEXT: vstrw.32 q5, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vstrw.32 q4, [r1, #16] -; CHECK-NEXT: vmov.f64 d8, d3 -; CHECK-NEXT: vstrw.32 q5, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s17, s27 -; CHECK-NEXT: vmov.f32 s19, s7 -; CHECK-NEXT: vmov.f32 s18, s3 -; CHECK-NEXT: vstrw.32 q4, [r1, #32] -; CHECK-NEXT: vmov.f64 d8, d5 -; CHECK-NEXT: vmov.f32 s17, s23 -; CHECK-NEXT: vmov.f32 s19, s11 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d8, d12 -; CHECK-NEXT: vmov.f32 s17, s0 -; CHECK-NEXT: vmov.f32 s19, s25 -; CHECK-NEXT: vmov.f32 s18, s4 -; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vmov.f64 d0, d2 -; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s8, s13 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vmov.f64 d6, d5 +; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s13, s27 +; CHECK-NEXT: vmov.f32 s15, s11 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vstrw.32 q3, [r1, #32] +; CHECK-NEXT: vmov.f64 d6, d3 +; CHECK-NEXT: vmov.f32 s13, s23 +; CHECK-NEXT: vmov.f32 s15, s7 +; CHECK-NEXT: vmov.f32 s14, s31 +; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d6, d12 +; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov.f32 s15, s25 +; CHECK-NEXT: vmov.f32 s14, s8 +; CHECK-NEXT: vmov q2, q7 +; CHECK-NEXT: vmov.f64 d0, d10 +; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.f32 s3, s21 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s4, s9 +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s7, s10 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s10, s6 +; CHECK-NEXT: vmov.f32 s6, s22 +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 
16-byte Spill -; CHECK-NEXT: vmov.f64 d8, d1 -; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vmov.f32 s20, s5 ; CHECK-NEXT: vmov.f32 s21, s1 ; CHECK-NEXT: vmov.f32 s23, s6 ; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d6, d15 -; CHECK-NEXT: vmov q6, q1 -; CHECK-NEXT: vmov.f32 s17, s7 +; CHECK-NEXT: vmov.f64 d12, d9 +; CHECK-NEXT: vmov q7, q1 +; CHECK-NEXT: vmov.f32 s9, s7 ; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s19, s3 -; CHECK-NEXT: vmov q0, q6 -; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov q0, q7 +; CHECK-NEXT: vmov.f32 s25, s7 ; CHECK-NEXT: vstrw.32 q5, [r1, #112] -; CHECK-NEXT: vmov.f32 s15, s31 -; CHECK-NEXT: vldrw.u32 q7, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s11 -; CHECK-NEXT: vldrw.u32 q2, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s25, s28 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s27, s1 +; CHECK-NEXT: vmov.f32 s27, s19 +; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s10, s15 +; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s29, s16 +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s31, s1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s14, s11 -; CHECK-NEXT: vstrw.32 q4, [r1, #128] -; CHECK-NEXT: vmov.f32 s26, s0 -; CHECK-NEXT: vstrw.32 q3, [r1, #80] +; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vstrw.32 q2, [r1, #128] +; CHECK-NEXT: vmov.f32 s30, s0 +; CHECK-NEXT: vstrw.32 q6, [r1, #80] ; CHECK-NEXT: vmov.f64 d0, d2 -; CHECK-NEXT: vstrw.32 q6, [r1, #96] -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vstrw.32 q7, [r1, #96] +; CHECK-NEXT: vmov.f32 s1, s12 ; CHECK-NEXT: vmov.f32 s3, s5 ; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s28 +; CHECK-NEXT: vmov.f32 s2, s16 ; CHECK-NEXT: vstrw.32 q0, [r1, #48] -; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s28, s5 +; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s16, s13 ; CHECK-NEXT: vstrw.32 q0, [r1, #144] ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s31, s6 +; CHECK-NEXT: vmov.f32 s19, s14 ; CHECK-NEXT: vstrw.32 q0, [r1, #160] ; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s30, s10 +; CHECK-NEXT: vmov.f32 s18, s6 ; CHECK-NEXT: vstrw.32 q0, [r1, #176] ; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q7, [r1, #64] +; CHECK-NEXT: vstrw.32 q4, [r1, #64] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: add sp, #160 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -1433,14 +1386,18 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) { ; CHECK-LABEL: vst3_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] -; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov q0[2], q0[0], lr, r3 -; CHECK-NEXT: vmov.32 q1[0], r4 -; CHECK-NEXT: vmov q0[3], q0[1], r12, r2 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrd r2, r12, [r0] +; CHECK-NEXT: ldrd r3, lr, [r0, #8] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: ldrd r2, r0, [r0, #16] +; 
CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.32 q0[1], r12 +; CHECK-NEXT: vmov.32 q1[1], lr +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: vmovx.f16 s8, s0 ; CHECK-NEXT: vmov.32 q1[1], r0 ; CHECK-NEXT: vins.f16 s0, s2 @@ -1453,16 +1410,16 @@ ; CHECK-NEXT: vmovx.f16 s12, s1 ; CHECK-NEXT: vins.f16 s1, s3 ; CHECK-NEXT: vmov.f32 s11, s1 +; CHECK-NEXT: vmovx.f16 s1, s3 ; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: vmovx.f16 s8, s5 ; CHECK-NEXT: vins.f16 s5, s12 -; CHECK-NEXT: vmov.f32 s4, s5 -; CHECK-NEXT: vmovx.f16 s5, s3 -; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.f32 s0, s5 +; CHECK-NEXT: vins.f16 s1, s8 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: strd r2, r0, [r1, #16] -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 %l1 = load <4 x half>, <4 x half>* %s1, align 4 @@ -1491,9 +1448,9 @@ ; CHECK-NEXT: vmovx.f16 s24, s6 ; CHECK-NEXT: vmov.f64 d0, d6 ; CHECK-NEXT: vins.f16 s0, s8 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.f32 s17, s4 ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vdup.32 q4, r0 +; CHECK-NEXT: vmov.f32 s18, s4 ; CHECK-NEXT: vmov.f32 s3, s13 ; CHECK-NEXT: vins.f16 s17, s20 ; CHECK-NEXT: vins.f16 s3, s9 @@ -1506,17 +1463,17 @@ ; CHECK-NEXT: vmov.16 q5[3], r0 ; CHECK-NEXT: vmovx.f16 s24, s7 ; CHECK-NEXT: vmovx.f16 s23, s11 -; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vrev32.16 q2, q2 ; CHECK-NEXT: vins.f16 s23, s24 -; CHECK-NEXT: vdup.32 q6, r0 +; CHECK-NEXT: vmov.f32 s25, s15 ; CHECK-NEXT: vmov.f32 s22, s7 ; CHECK-NEXT: vmovx.f16 s28, s21 +; CHECK-NEXT: vmov.f32 s26, s15 ; CHECK-NEXT: vins.f16 s25, s28 ; CHECK-NEXT: vmovx.f16 s28, s26 ; CHECK-NEXT: vins.f16 s22, s28 ; CHECK-NEXT: vmovx.f16 s28, s13 ; CHECK-NEXT: vmov.f32 s4, s5 -; CHECK-NEXT: vrev32.16 q2, q2 ; CHECK-NEXT: vins.f16 s4, s28 ; CHECK-NEXT: vmovx.f16 s28, s14 ; CHECK-NEXT: vins.f16 s6, s28 @@ -1561,141 +1518,143 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #144 ; CHECK-NEXT: sub sp, #144 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmovx.f16 s0, s12 -; CHECK-NEXT: vmov q6, q2 +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q5, [r0, #80] +; CHECK-NEXT: vldrw.u32 q7, [r0, #16] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vmovx.f16 s0, s22 +; CHECK-NEXT: vmovx.f16 s4, s10 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmov r2, s11 +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmovx.f16 s0, s23 +; CHECK-NEXT: vmovx.f16 s7, s11 +; CHECK-NEXT: vstrw.32 q2, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmov.f32 s9, s31 +; CHECK-NEXT: vmov.f32 s6, s23 +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmov.f32 s10, s31 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vins.f16 s9, s0 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vins.f16 s6, s0 +; CHECK-NEXT: vmovx.f16 s0, s16 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d0, d4 -; CHECK-NEXT: vmovx.f16 s20, s14 -; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vins.f16 s0, s12 +; CHECK-NEXT: vldrw.u32 q6, [r0, #64] +; CHECK-NEXT: vmov.f64 d0, d6 +; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vins.f16 s0, s16 +; CHECK-NEXT: vmov.f32 s10, s6 ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vins.f16 s3, 
s13 +; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s3, s13 +; CHECK-NEXT: vmov.f32 s9, s24 +; CHECK-NEXT: vins.f16 s3, s17 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov.f32 s5, s8 -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vdup.32 q4, r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vins.f16 s17, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 +; CHECK-NEXT: vmovx.f16 s0, s12 +; CHECK-NEXT: vmov.f32 s10, s24 +; CHECK-NEXT: vins.f16 s9, s0 +; CHECK-NEXT: vmov.f32 s5, s12 +; CHECK-NEXT: vmovx.f16 s0, s10 ; CHECK-NEXT: vins.f16 s6, s0 +; CHECK-NEXT: vmov.f32 s10, s6 +; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s9, s20 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmov.f32 s10, s20 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.f64 d0, d14 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmov.f32 s20, s21 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.f32 s3, s29 +; CHECK-NEXT: vins.f16 s3, s5 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmovx.f16 s0, s28 +; CHECK-NEXT: vins.f16 s9, s0 +; CHECK-NEXT: vmov.f32 s5, s28 ; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vins.f16 s20, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov r2, s27 -; CHECK-NEXT: vmovx.f16 s23, s15 -; CHECK-NEXT: vmov.f32 s18, s6 -; CHECK-NEXT: vins.f16 s23, s0 -; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s22, s11 -; CHECK-NEXT: vmovx.f16 s0, s21 -; CHECK-NEXT: vdup.32 q1, r2 -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmovx.f16 s0, s6 -; CHECK-NEXT: vins.f16 s22, s0 -; CHECK-NEXT: vmovx.f16 s0, s12 -; CHECK-NEXT: vmov.f32 s6, s22 -; CHECK-NEXT: vldrw.u32 q6, [r0, #80] -; CHECK-NEXT: vstrw.32 q1, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vstrw.32 q4, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d14, d2 +; CHECK-NEXT: vins.f16 s6, s0 +; CHECK-NEXT: vmov.f32 s10, s6 +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s16, s26 +; CHECK-NEXT: vmovx.f16 s8, s6 +; CHECK-NEXT: vins.f16 s8, s16 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.16 q2[3], r0 ; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vins.f16 s28, s12 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov.16 q7[4], r2 -; CHECK-NEXT: vmov.f32 s8, s9 -; CHECK-NEXT: vmov.f32 s31, s5 -; CHECK-NEXT: vins.f16 s31, s13 -; CHECK-NEXT: vmov.f32 s29, s4 -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmovx.f16 s0, s6 -; CHECK-NEXT: vins.f16 s30, s0 -; CHECK-NEXT: vmov q0, q3 -; CHECK-NEXT: vmov.f32 s6, s30 -; CHECK-NEXT: vmovx.f16 s12, s26 -; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s4, s2 -; CHECK-NEXT: vins.f16 s4, s12 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmovx.f16 s11, s7 +; CHECK-NEXT: vmovx.f16 s16, s27 +; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: vins.f16 s11, s16 +; CHECK-NEXT: vmov.f32 s1, s7 +; CHECK-NEXT: vmov.f32 s10, s27 +; CHECK-NEXT: vmovx.f16 s16, s9 +; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vins.f16 s1, s16 +; CHECK-NEXT: vmovx.f16 s16, s2 +; CHECK-NEXT: vins.f16 s10, s16 +; CHECK-NEXT: vmovx.f16 s16, s29 
+; CHECK-NEXT: vmov.f32 s2, s10
+; CHECK-NEXT: vins.f16 s20, s16
+; CHECK-NEXT: vmovx.f16 s16, s30
; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vmovx.f16 s7, s3
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT: vmovx.f16 s12, s27
-; CHECK-NEXT: vins.f16 s7, s12
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: vmov.f32 s6, s27
-; CHECK-NEXT: vmovx.f16 s16, s5
-; CHECK-NEXT: vdup.32 q3, r0
+; CHECK-NEXT: vins.f16 s22, s16
+; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s23, s22
+; CHECK-NEXT: vmov.f32 s22, s30
+; CHECK-NEXT: vrev32.16 q3, q0
+; CHECK-NEXT: vmovx.f16 s16, s21
; CHECK-NEXT: vmov.f32 s24, s25
; CHECK-NEXT: vins.f16 s13, s16
; CHECK-NEXT: vmovx.f16 s16, s14
-; CHECK-NEXT: vins.f16 s6, s16
-; CHECK-NEXT: vmovx.f16 s16, s1
+; CHECK-NEXT: vins.f16 s22, s16
+; CHECK-NEXT: vmovx.f16 s16, s5
; CHECK-NEXT: vins.f16 s24, s16
-; CHECK-NEXT: vmovx.f16 s16, s2
+; CHECK-NEXT: vmovx.f16 s16, s6
; CHECK-NEXT: vins.f16 s26, s16
-; CHECK-NEXT: vmov.f32 s14, s6
+; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s27, s26
-; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s26, s2
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmovx.f16 s16, s25
-; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vrev32.16 q0, q0
-; CHECK-NEXT: vins.f16 s1, s16
-; CHECK-NEXT: vmovx.f16 s16, s2
-; CHECK-NEXT: vins.f16 s26, s16
-; CHECK-NEXT: vmovx.f16 s16, s13
-; CHECK-NEXT: vmov.f32 s2, s26
-; CHECK-NEXT: vins.f16 s8, s16
-; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vmovx.f16 s16, s14
-; CHECK-NEXT: vins.f16 s10, s16
+; CHECK-NEXT: vmov.f32 s26, s6
; CHECK-NEXT: vrev32.16 q4, q0
-; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s11, s10
-; CHECK-NEXT: vmov.f32 s10, s14
-; CHECK-NEXT: vmovx.f16 s12, s9
+; CHECK-NEXT: vmovx.f16 s4, s25
+; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT: vins.f16 s17, s4
+; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s5, s1
-; CHECK-NEXT: vins.f16 s17, s12
+; CHECK-NEXT: vmovx.f16 s28, s18
+; CHECK-NEXT: vmov.f32 s6, s2
+; CHECK-NEXT: vins.f16 s26, s28
+; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q7, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s14, s22
+; CHECK-NEXT: vmov.f32 s18, s26
+; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s29, s5
+; CHECK-NEXT: vmov.f32 s30, s6
+; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s5, s1
+; CHECK-NEXT: vstrw.32 q7, [r1]
; CHECK-NEXT: vmov.f32 s6, s2
-; CHECK-NEXT: vmovx.f16 s12, s18
-; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT: vins.f16 s10, s12
-; CHECK-NEXT: vldrw.u32 q3, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s18, s10
-; CHECK-NEXT: vmov.f32 s13, s1
-; CHECK-NEXT: vstrw.32 q1, [r1, #80]
-; CHECK-NEXT: vmov.f32 s14, s2
-; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s9, s17
-; CHECK-NEXT: vstrw.32 q3, [r1]
-; CHECK-NEXT: vmov.f32 s21, s1
-; CHECK-NEXT: vmov.f32 s22, s2
-; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q5, [r1, #32]
-; CHECK-NEXT: vmov.f32 s29, s1
-; CHECK-NEXT: vmov.f32 s30, s2
; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s10, s18
-; CHECK-NEXT: vstrw.32 q7, [r1, #48]
-; CHECK-NEXT: vmov.f32 s25, s1
-; CHECK-NEXT: vstrw.32 q2, [r1, #16]
-; CHECK-NEXT: vmov.f32 s26, s2
-; CHECK-NEXT: vstrw.32 q6, [r1, #64]
+; CHECK-NEXT: vmov.f32 s21, s13
+; CHECK-NEXT: vstrw.32 q1, [r1, #48]
+; CHECK-NEXT: vmov.f32 s9, s1
+; CHECK-NEXT: vmov.f32 s10, s2
+; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s25, s17
+; CHECK-NEXT: vstrw.32 q2, [r1, #32]
+; CHECK-NEXT: vmov.f32 s22, s14
+; CHECK-NEXT: vstrw.32 q0, [r1, #80]
+; CHECK-NEXT: vmov.f32 s26, s18
+; CHECK-NEXT: vstrw.32 q5, [r1, #64]
+; CHECK-NEXT: vstrw.32 q6, [r1, #16]
; CHECK-NEXT: add sp, #144
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -1045,19 +1045,21 @@
; CHECK-LABEL: vst4_v2f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldmia r0, {s4, s5}
-; CHECK-NEXT: vmovx.f16 s12, s5
-; CHECK-NEXT: ldr r0, [r0, #8]
-; CHECK-NEXT: vmov.f64 d0, d2
-; CHECK-NEXT: vdup.32 q2, r0
-; CHECK-NEXT: vins.f16 s0, s5
-; CHECK-NEXT: vmov.f32 s1, s8
-; CHECK-NEXT: vins.f16 s1, s9
-; CHECK-NEXT: vmovx.f16 s2, s4
-; CHECK-NEXT: vmovx.f16 s4, s9
-; CHECK-NEXT: vins.f16 s2, s12
-; CHECK-NEXT: vmovx.f16 s3, s8
-; CHECK-NEXT: vins.f16 s3, s4
-; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: vldr s0, [r0, #8]
+; CHECK-NEXT: vmovx.f16 s12, s4
+; CHECK-NEXT: vins.f16 s4, s5
+; CHECK-NEXT: vmov.f32 s1, s0
+; CHECK-NEXT: vmovx.f16 s14, s0
+; CHECK-NEXT: vmov q2, q1
+; CHECK-NEXT: vins.f16 s0, s1
+; CHECK-NEXT: vmovx.f16 s4, s5
+; CHECK-NEXT: vmov.f32 s9, s0
+; CHECK-NEXT: vins.f16 s12, s4
+; CHECK-NEXT: vmovx.f16 s0, s1
+; CHECK-NEXT: vmov.f32 s10, s12
+; CHECK-NEXT: vins.f16 s14, s0
+; CHECK-NEXT: vmov.f32 s11, s14
+; CHECK-NEXT: vstrh.16 q2, [r1]
; CHECK-NEXT: bx lr
entry:
  %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
@@ -1078,23 +1080,29 @@
define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) {
; CHECK-LABEL: vst4_v4f16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: ldrd lr, r12, [r0]
; CHECK-NEXT: ldrd r3, r2, [r0, #8]
-; CHECK-NEXT: ldrd r4, r0, [r0, #16]
-; CHECK-NEXT: vmov q0[2], q0[0], lr, r3
-; CHECK-NEXT: vmov q0[3], q0[1], r12, r2
-; CHECK-NEXT: vmov q1[2], q1[0], r4, r4
+; CHECK-NEXT: vmov.32 q0[0], lr
+; CHECK-NEXT: vmov.32 q1[0], r3
+; CHECK-NEXT: vmov.32 q0[1], r12
+; CHECK-NEXT: vmov.32 q1[1], r2
+; CHECK-NEXT: ldrd r2, r0, [r0, #16]
+; CHECK-NEXT: vmov.f32 s2, s4
+; CHECK-NEXT: vmov.f32 s3, s5
; CHECK-NEXT: vmov.f32 s8, s1
-; CHECK-NEXT: vmov q1[3], q1[1], r0, r0
-; CHECK-NEXT: vins.f16 s8, s3
; CHECK-NEXT: vmovx.f16 s12, s3
+; CHECK-NEXT: vins.f16 s8, s5
+; CHECK-NEXT: vmov.32 q1[0], r2
+; CHECK-NEXT: vmov.32 q1[1], r0
+; CHECK-NEXT: vmov.f32 s6, s4
+; CHECK-NEXT: vmov.f32 s7, s5
; CHECK-NEXT: vmov.f32 s9, s5
-; CHECK-NEXT: vins.f16 s9, s7
+; CHECK-NEXT: vins.f16 s9, s5
; CHECK-NEXT: vmovx.f16 s10, s1
; CHECK-NEXT: vins.f16 s10, s12
-; CHECK-NEXT: vmovx.f16 s12, s7
+; CHECK-NEXT: vmovx.f16 s12, s5
; CHECK-NEXT: vmovx.f16 s11, s5
; CHECK-NEXT: vins.f16 s11, s12
; CHECK-NEXT: vstrh.16 q2, [r1, #16]
@@ -1109,7 +1117,7 @@
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vins.f16 s3, s4
; CHECK-NEXT: vstrh.16 q0, [r1]
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r7, pc}
entry:
  %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
  %l1 = load <4 x half>, <4 x half>* %s1, align 4