Index: llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -7761,6 +7761,92 @@ DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } +static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, + ArrayRef ShuffleMask, + SelectionDAG &DAG) { + // Attempt to lower the vector shuffle using as many whole register movs as + // possible. This is useful for types smaller than 32bits, which would + // often otherwise become a series for grp movs. + SDLoc dl(Op); + EVT VT = Op.getValueType(); + if (VT.getScalarSizeInBits() >= 32) + return SDValue(); + + assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && + "Unexpected vector type"); + int NumElts = VT.getVectorNumElements(); + int QuarterSize = NumElts / 4; + // The four final parts of the vector, as i32's + SDValue Parts[4]; + + // Look for full lane vmovs like <0,1,2,3> or etc, (but not + // ), returning the vmov lane index + auto getMovIdx = [](ArrayRef ShuffleMask, int Start, int Length) { + // Detect which mov lane this would be from the first non-undef element. + int MovIdx = -1; + for (int i = 0; i < Length; i++) { + if (ShuffleMask[Start + i] >= 0) { + if (ShuffleMask[Start + i] % Length != i) + return -1; + MovIdx = ShuffleMask[Start + i] / Length; + break; + } + } + // If all items are undef, leave this for other combines + if (MovIdx == -1) + return -1; + // Check the remaining values are the correct part of the same mov + for (int i = 1; i < Length; i++) { + if (ShuffleMask[Start + i] >= 0 && + (ShuffleMask[Start + i] / Length != MovIdx || + ShuffleMask[Start + i] % Length != i)) + return -1; + } + return MovIdx; + }; + + for (int Part = 0; Part < 4; ++Part) { + // Does this part look like a mov + int Elt; + if ((Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize)) != -1) { + SDValue Input = Op->getOperand(0); + if (Elt >= 4) { + Input = Op->getOperand(1); + Elt -= 4; + } + SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input); + Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast, + DAG.getConstant(Elt, dl, MVT::i32)); + } + } + + // Nothing interesting found, just return + if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3]) + return SDValue(); + + // The other parts need to be built with the old shuffle vector, cast to a + // v4i32 and extract_vector_elts + if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) { + std::vector NewShuffleMask; + for (int Part = 0; Part < 4; ++Part) + for (int i = 0; i < QuarterSize; i++) + NewShuffleMask.push_back( + Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]); + SDValue NewShuffle = DAG.getVectorShuffle( + VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask); + SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle); + + for (int Part = 0; Part < 4; ++Part) + if (!Parts[Part]) + Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + BitCast, DAG.getConstant(Part, dl, MVT::i32)); + } + // Build a vector out of the various parts and bitcast it back to the original + // type. + SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts); + return DAG.getBitcast(VT, NewVec); +} + static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { SDValue V1 = Op.getOperand(0); @@ -7955,6 +8041,10 @@ if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) return NewOp; + if (ST->hasMVEIntegerOps()) + if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG)) + return NewOp; + return SDValue(); } Index: llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll @@ -260,9 +260,8 @@ ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov.u16 r0, q0[0] ; CHECK-NEXT: vdup.16 q1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q1[7], r0 ; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp ; CHECK-NEXT: vcmp.i16 ne, q1, zr Index: llvm/test/CodeGen/Thumb2/mve-shuffle.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -98,11 +98,7 @@ ; CHECK-LABEL: shuffle3_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.u16 r0, q0[7] ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov.u16 r0, q1[6] ; CHECK-NEXT: vmov.16 q0[3], r0 @@ -114,6 +110,7 @@ ; CHECK-NEXT: vmov.16 q0[6], r0 ; CHECK-NEXT: vmov.u16 r0, q1[0] ; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s0, s6 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> @@ -387,16 +384,11 @@ define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) { ; CHECK-LABEL: shuffle3_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s2 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q1[0], r1 -; CHECK-NEXT: vmovx.f16 s8, s3 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov.16 q1[2], r0 +; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmov.16 q1[2], r1 ; CHECK-NEXT: vmov.16 q1[3], r0 ; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmovx.f16 s8, s0 @@ -407,6 +399,7 @@ ; CHECK-NEXT: vmov.16 q1[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: Index: llvm/test/CodeGen/Thumb2/mve-shufflemov.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-shufflemov.ll +++ llvm/test/CodeGen/Thumb2/mve-shufflemov.ll @@ -7,23 +7,11 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_45670123(<8 x i16> %s1, <8 x i16> %s2) { ; CHECK-LABEL: shuffle_i16_45670123: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> @@ -33,23 +21,11 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_67452301(<8 x i16> %s1, <8 x i16> %s2) { ; CHECK-LABEL: shuffle_i16_67452301: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> @@ -94,23 +70,8 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_0123cdef(<8 x i16> %s1, <8 x i16> %s2) { ; CHECK-LABEL: shuffle_i16_0123cdef: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s3, s7 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> @@ -120,14 +81,10 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_u7u5u3u1(<8 x i16> %s1, <8 x i16> %s2) { ; CHECK-LABEL: shuffle_i16_u7u5u3u1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -138,14 +95,10 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_6u4u2u0u(<8 x i16> %s1, <8 x i16> %s2) { ; CHECK-LABEL: shuffle_i16_6u4u2u0u: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -179,39 +132,11 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdef89ab45670123(<16 x i8> %s1, <16 x i8> %s2) { ; CHECK-LABEL: shuffle_i8_cdef89ab45670123: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> @@ -314,39 +239,11 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_0123ghij4567klmn(<16 x i8> %s1, <16 x i8> %s2) { ; CHECK-LABEL: shuffle_i8_0123ghij4567klmn: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[3] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[5] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u8 r0, q2[6] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[7] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vmov.f32 s9, s4 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: %out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> @@ -356,31 +253,11 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdeu89ub4u67u123(<16 x i8> %s1, <16 x i8> %s2) { ; CHECK-LABEL: shuffle_i8_cdeu89ub4u67u123: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> @@ -390,23 +267,11 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cduu8uubuu67u12u(<16 x i8> %s1, <16 x i8> %s2) { ; CHECK-LABEL: shuffle_i8_cduu8uubuu67u12u: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[14], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> @@ -416,14 +281,10 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cuuuuuubuu6uuu2u(<16 x i8> %s1, <16 x i8> %s2) { ; CHECK-LABEL: shuffle_i8_cuuuuuubuu6uuu2u: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.8 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.8 q1[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.8 q1[10], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.8 q1[14], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -434,36 +295,16 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdef89ab45u700123(<16 x i8> %s1, <16 x i8> %s2) { ; CHECK-LABEL: shuffle_i8_cdef89ab45u700123: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.8 q1[8], r0 +; CHECK-NEXT: vmov.u8 r0, q0[5] +; CHECK-NEXT: vmov.8 q1[9], r0 +; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmov.8 q1[11], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %out = shufflevector <16 x i8> %s1, <16 x i8> %s2, <16 x i32> @@ -477,26 +318,10 @@ define arm_aapcs_vfpcc <8 x half> @shuffle_f16_45670123(<8 x half> %s1, <8 x half> %s2) { ; CHECK-LABEL: shuffle_f16_45670123: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s2 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q1[0], r1 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmovx.f16 s8, s3 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -507,26 +332,10 @@ define arm_aapcs_vfpcc <8 x half> @shuffle_f16_67452301(<8 x half> %s1, <8 x half> %s2) { ; CHECK-LABEL: shuffle_f16_67452301: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q1[0], r1 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -576,27 +385,8 @@ define arm_aapcs_vfpcc <8 x half> @shuffle_f16_0123cdef(<8 x half> %s1, <8 x half> %s2) { ; CHECK-LABEL: shuffle_f16_0123cdef: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmovx.f16 s4, s7 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q0[0], r1 -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmovx.f16 s8, s9 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s3, s7 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x half> %s1, <8 x half> %s2, <8 x i32> @@ -606,18 +396,10 @@ define arm_aapcs_vfpcc <8 x half> @shuffle_f16_u7u5u3u1(<8 x half> %s1, <8 x half> %s2) { ; CHECK-LABEL: shuffle_f16_u7u5u3u1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s2 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.16 q1[3], r1 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -628,14 +410,10 @@ define arm_aapcs_vfpcc <8 x half> @shuffle_f16_6u4u2u0u(<8 x half> %s1, <8 x half> %s2) { ; CHECK-LABEL: shuffle_f16_6u4u2u0u: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov.16 q1[2], r1 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: