diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -17217,7 +17217,7 @@
 }
 
 // Some combines for the MVETrunc truncations legalizer helper. Also lowers the
-// node into a buildvector after legalizeOps.
+// node into stack operations after legalizeOps.
 SDValue ARMTargetLowering::PerformMVETruncCombine(
     SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -17265,7 +17265,14 @@
     }
   }
 
-  auto LowerToBuildVec = [&]() {
+  // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
+  // truncate to a buildvector to allow the generic optimisations to kick in.
+  if (all_of(N->ops(), [](SDValue Op) {
+        return Op.getOpcode() == ISD::BUILD_VECTOR ||
+               Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
+               (Op.getOpcode() == ISD::BITCAST &&
+                Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
+      })) {
     SmallVector<SDValue, 8> Extracts;
     for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
       SDValue O = N->getOperand(Op);
@@ -17276,26 +17283,40 @@
       }
     }
     return DAG.getBuildVector(VT, DL, Extracts);
-  };
-
-  // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
-  // truncate to a buildvector to allow the generic optimisations to kick in.
-  if (all_of(N->ops(), [](SDValue Op) {
-        return Op.getOpcode() == ISD::BUILD_VECTOR ||
-               Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
-               (Op.getOpcode() == ISD::BITCAST &&
-                Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
-      }))
-    return LowerToBuildVec();
+  }
 
   // If we are late in the legalization process and nothing has optimised
-  // the trunc to anything better lower it to a series of extracts and a
-  // buildvector.
+  // the trunc to anything better, lower it to a stack store and reload,
+  // performing the truncation whilst keeping the lanes in the correct order:
+  //   VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
-  SDValue BuildVec = LowerToBuildVec();
-  return LowerBUILD_VECTOR(BuildVec, DCI.DAG, Subtarget);
+  SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
+  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  int NumIns = N->getNumOperands();
+  assert((NumIns == 2 || NumIns == 4) &&
+         "Expected 2 or 4 inputs to an MVETrunc");
+  EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+  if (N->getNumOperands() == 4)
+    StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
+
+  SmallVector<SDValue> Chains;
+  for (int I = 0; I < NumIns; I++) {
+    SDValue Ptr = DAG.getNode(
+        ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
+        DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
+    MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
+        DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
+    SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
+                                   Ptr, MPI, StoreVT, Align(4));
+    Chains.push_back(Ch);
+  }
+
+  SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  MachinePointerInfo MPI =
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
+  return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
 }
 
 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -275,105 +275,88 @@
 define arm_aapcs_vfpcc <16 x i8>
@ext_add_ashr_trunc_i8i32(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: ext_add_ashr_trunc_i8i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.u8 r1, q1[12] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.u8 r1, q1[13] +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vmov.u8 r1, q1[14] +; CHECK-NEXT: vmov.u8 r2, q1[12] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q1[15] +; CHECK-NEXT: vmov.u8 r2, q1[13] ; CHECK-NEXT: vmov.i32 q2, #0xff -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: vmov.u8 r2, q0[12] ; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: vmov.u8 r4, q1[6] -; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vmov q4[3], q4[1], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q1[10] ; CHECK-NEXT: vmovlb.s8 q4, q4 -; CHECK-NEXT: vmov.u8 r1, q1[0] +; CHECK-NEXT: vmov.u8 r2, q1[8] ; CHECK-NEXT: vmovlb.s16 q4, q4 -; CHECK-NEXT: vmov.u8 r5, q1[4] ; CHECK-NEXT: vadd.i32 q3, q4, q3 ; CHECK-NEXT: vshr.u32 q3, q3, #1 -; CHECK-NEXT: vmov lr, r12, d7 -; CHECK-NEXT: vmov r3, r2, d6 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.u8 r1, q1[1] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: vstrb.32 q3, [r0, #12] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q1[11] +; CHECK-NEXT: vmov.u8 r2, q1[9] +; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: vmov.u8 r2, q0[8] ; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: vmov q4[3], q4[1], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q1[6] ; CHECK-NEXT: vmovlb.s8 q4, q4 +; CHECK-NEXT: vmov.u8 r2, q1[4] ; CHECK-NEXT: vmovlb.s16 q4, q4 ; CHECK-NEXT: vadd.i32 q3, q4, q3 -; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 -; CHECK-NEXT: vmov.u8 r4, q1[7] -; CHECK-NEXT: vmov.u8 r5, q1[5] -; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 -; CHECK-NEXT: vmov.u8 r4, q0[6] -; CHECK-NEXT: vmov.u8 r5, q0[4] ; CHECK-NEXT: vshr.u32 q3, q3, #1 -; CHECK-NEXT: vmov q5[2], q5[0], r5, r4 -; CHECK-NEXT: vmov.u8 r4, q0[7] -; CHECK-NEXT: vmov.u8 r5, q0[5] -; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov q5[3], q5[1], r5, r4 -; CHECK-NEXT: vmov.u8 r4, q0[10] -; CHECK-NEXT: vmovlb.s8 q5, q5 -; CHECK-NEXT: vmov.u8 r5, q0[8] -; CHECK-NEXT: vmovlb.s16 q5, q5 -; CHECK-NEXT: vmov r1, r0, d6 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov q5[2], q5[0], r5, r4 -; CHECK-NEXT: vmov.u8 r4, q0[11] -; CHECK-NEXT: vmov.u8 r5, q0[9] -; CHECK-NEXT: vmov q5[3], q5[1], r5, r4 -; CHECK-NEXT: vmov.8 q0[0], r1 -; CHECK-NEXT: vmov.u8 
r4, q1[10] -; CHECK-NEXT: vmov.u8 r5, q1[8] -; CHECK-NEXT: vmov q6[2], q6[0], r5, r4 -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, r1, d7 -; CHECK-NEXT: vmov.u8 r4, q1[11] -; CHECK-NEXT: vmov.u8 r5, q1[9] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov q6[3], q6[1], r5, r4 -; CHECK-NEXT: vshr.u32 q4, q4, #1 -; CHECK-NEXT: vmov.8 q0[3], r1 -; CHECK-NEXT: vmov r0, r1, d8 -; CHECK-NEXT: vand q1, q6, q2 -; CHECK-NEXT: vmovlb.s8 q2, q5 -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmovlb.s16 q2, q2 -; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vmov r4, r5, d9 -; CHECK-NEXT: vmov.8 q0[5], r1 -; CHECK-NEXT: vshr.u32 q1, q1, #1 -; CHECK-NEXT: vmov.8 q0[6], r4 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov.8 q0[7], r5 -; CHECK-NEXT: vmov r4, r5, d2 -; CHECK-NEXT: vmov.8 q0[8], r4 -; CHECK-NEXT: vmov.8 q0[9], r5 -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.8 q0[11], r1 -; CHECK-NEXT: vmov.8 q0[12], r3 -; CHECK-NEXT: vmov.8 q0[13], r2 -; CHECK-NEXT: vmov.8 q0[14], lr -; CHECK-NEXT: vmov.8 q0[15], r12 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vstrb.32 q3, [r0, #8] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q1[7] +; CHECK-NEXT: vmov.u8 r2, q1[5] +; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q0[6] +; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q0[7] +; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: vmov q4[3], q4[1], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q1[2] +; CHECK-NEXT: vmovlb.s8 q4, q4 +; CHECK-NEXT: vmov.u8 r2, q1[0] +; CHECK-NEXT: vmovlb.s16 q4, q4 +; CHECK-NEXT: vadd.i32 q3, q4, q3 +; CHECK-NEXT: vshr.u32 q3, q3, #1 +; CHECK-NEXT: vstrb.32 q3, [r0, #4] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q1[3] +; CHECK-NEXT: vmov.u8 r2, q1[1] +; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: vand q1, q3, q2 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 +; CHECK-NEXT: vmovlb.s8 q0, q2 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: vstrb.32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr entry: %sa = sext <16 x i8> %a to <16 x i32> %sb = zext <16 x i8> %b to <16 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll @@ -362,23 +362,16 @@ define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) { ; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r4, r5, d0 -; CHECK-NEXT: vmov.16 q2[0], r4 -; CHECK-NEXT: vmov lr, r12, d3 -; CHECK-NEXT: vmov r3, r2, d2 -; CHECK-NEXT: vldrb.u16 q1, [r1] -; CHECK-NEXT: vmov r1, r4, d1 -; CHECK-NEXT: vmov.16 q2[1], r5 -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.16 q2[3], r4 -; CHECK-NEXT: vmov.16 q2[4], r3 -; CHECK-NEXT: vmov.16 q2[5], r2 -; CHECK-NEXT: vmov.16 q2[6], lr -; CHECK-NEXT: vmov.16 q2[7], r12 -; CHECK-NEXT: vstrh.16 q2, [r0, q1] -; CHECK-NEXT: pop 
{r4, r5, r7, pc} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: vstrh.32 q1, [r2, #8] +; CHECK-NEXT: vstrh.32 q0, [r2] +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vstrh.16 q1, [r0, q0] +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: bx lr entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 %offs.zext = zext <8 x i8> %offs to <8 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll @@ -374,38 +374,18 @@ define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) { ; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov r4, r5, d0 -; CHECK-NEXT: vmov.8 q4[0], r4 -; CHECK-NEXT: vmov lr, r12, d7 -; CHECK-NEXT: vmov r3, r2, d6 -; CHECK-NEXT: vldrb.u8 q3, [r1] -; CHECK-NEXT: vmov r1, r4, d1 -; CHECK-NEXT: vmov.8 q4[1], r5 -; CHECK-NEXT: vmov.8 q4[2], r1 -; CHECK-NEXT: vmov r1, r5, d2 -; CHECK-NEXT: vmov.8 q4[3], r4 -; CHECK-NEXT: vmov.8 q4[4], r1 -; CHECK-NEXT: vmov r1, r4, d3 -; CHECK-NEXT: vmov.8 q4[5], r5 -; CHECK-NEXT: vmov.8 q4[6], r1 -; CHECK-NEXT: vmov r1, r5, d4 -; CHECK-NEXT: vmov.8 q4[7], r4 -; CHECK-NEXT: vmov.8 q4[8], r1 -; CHECK-NEXT: vmov r1, r4, d5 -; CHECK-NEXT: vmov.8 q4[9], r5 -; CHECK-NEXT: vmov.8 q4[10], r1 -; CHECK-NEXT: vmov.8 q4[11], r4 -; CHECK-NEXT: vmov.8 q4[12], r3 -; CHECK-NEXT: vmov.8 q4[13], r2 -; CHECK-NEXT: vmov.8 q4[14], lr -; CHECK-NEXT: vmov.8 q4[15], r12 -; CHECK-NEXT: vstrb.8 q4, [r0, q3] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: vstrb.32 q3, [r2, #12] +; CHECK-NEXT: vstrb.32 q2, [r2, #8] +; CHECK-NEXT: vstrb.32 q1, [r2, #4] +; CHECK-NEXT: vstrb.32 q0, [r2] +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vstrb.8 q1, [r0, q0] +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: bx lr entry: %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 %offs.zext = zext <16 x i8> %offs to <16 x i32> @@ -418,40 +398,15 @@ define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <16 x i8>* %offptr, <16 x i16> %input) { ; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r3, q0[0] -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.8 q2[0], r3 -; CHECK-NEXT: vmov.u16 r3, q0[1] -; CHECK-NEXT: vmov.8 q2[1], r3 -; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov.8 q2[2], r3 -; CHECK-NEXT: vmov.u16 r3, q0[3] -; CHECK-NEXT: vmov.8 q2[3], r3 -; CHECK-NEXT: vmov.u16 r3, q0[4] -; CHECK-NEXT: vmov.8 q2[4], r3 -; CHECK-NEXT: vmov.u16 r3, q0[5] -; CHECK-NEXT: vmov.8 q2[5], r3 -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: vmov.8 q2[6], r3 -; CHECK-NEXT: vmov.u16 r3, q0[7] -; CHECK-NEXT: vmov.8 q2[7], r3 -; CHECK-NEXT: vmov.u16 r3, q1[0] -; CHECK-NEXT: vmov.8 q2[8], r3 -; CHECK-NEXT: vmov.u16 r3, q1[1] -; CHECK-NEXT: vmov.8 q2[9], r3 -; CHECK-NEXT: vmov.u16 r3, q1[2] -; CHECK-NEXT: vmov.8 q2[10], r3 -; CHECK-NEXT: vmov.u16 r3, q1[3] -; CHECK-NEXT: vmov.8 q2[11], r3 -; CHECK-NEXT: vmov.u16 r3, q1[4] -; CHECK-NEXT: vmov.8 q2[12], r3 -; CHECK-NEXT: vmov.u16 r3, q1[5] -; CHECK-NEXT: vmov.8 
q2[13], r3 -; CHECK-NEXT: vmov.u16 r3, q1[6] -; CHECK-NEXT: vmov.8 q2[14], r3 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: vstrb.16 q1, [r2, #8] +; CHECK-NEXT: vstrb.16 q0, [r2] ; CHECK-NEXT: vldrb.u8 q0, [r1] -; CHECK-NEXT: vmov.8 q2[15], r2 -; CHECK-NEXT: vstrb.8 q2, [r0, q0] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vstrb.8 q1, [r0, q0] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 diff --git a/llvm/test/CodeGen/Thumb2/mve-sext.ll b/llvm/test/CodeGen/Thumb2/mve-sext.ll --- a/llvm/test/CodeGen/Thumb2/mve-sext.ll +++ b/llvm/test/CodeGen/Thumb2/mve-sext.ll @@ -434,39 +434,13 @@ define arm_aapcs_vfpcc <16 x i8> @trunc_v16i16_v16i8(<16 x i16> %src) { ; CHECK-LABEL: trunc_v16i16_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vstrb.16 q1, [r0, #8] +; CHECK-NEXT: vstrb.16 q0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %0 = trunc <16 x i16> %src to <16 x i8> @@ -476,19 +450,13 @@ define arm_aapcs_vfpcc <8 x i16> @trunc_v8i32_v8i16(<8 x i32> %src) { ; CHECK-LABEL: trunc_v8i32_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov r0, r1, d4 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.16 q0[7], r1 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vstrh.32 q1, [r0, #8] +; CHECK-NEXT: vstrh.32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %0 = trunc <8 x i32> %src to <8 x i16> @@ -498,34 +466,15 @@ define arm_aapcs_vfpcc <16 x i8> @trunc_v16i32_v16i8(<16 x i32> %src) { ; CHECK-LABEL: trunc_v16i32_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, r1, d8 -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.8 q0[1], r1 -; CHECK-NEXT: vmov r0, r1, d9 -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.8 q0[3], r1 -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: 
vmov.8 q0[5], r1 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.8 q0[7], r1 -; CHECK-NEXT: vmov r0, r1, d4 -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.8 q0[9], r1 -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.8 q0[11], r1 -; CHECK-NEXT: vmov r0, r1, d6 -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.8 q0[13], r1 -; CHECK-NEXT: vmov r0, r1, d7 -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.8 q0[15], r1 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vstrb.32 q3, [r0, #12] +; CHECK-NEXT: vstrb.32 q2, [r0, #8] +; CHECK-NEXT: vstrb.32 q1, [r0, #4] +; CHECK-NEXT: vstrb.32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %0 = trunc <16 x i32> %src to <16 x i8>
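For reference, a minimal standalone reproduction of the new lowering, mirroring the existing trunc_v8i32_v8i16 test in llvm/test/CodeGen/Thumb2/mve-sext.ll above; it is not part of the patch, and the RUN invocation is assumed to be the file's usual MVE one (llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve):

; Illustrative only; not part of the patch. With this change the <8 x i32>
; -> <8 x i16> truncate is lowered through a 16-byte stack slot: two
; truncating vstrh.32 stores followed by a single vldrw.u32 reload, matching
; the CHECK lines for trunc_v8i32_v8i16 above.
define arm_aapcs_vfpcc <8 x i16> @trunc_v8i32_v8i16_example(<8 x i32> %src) {
entry:
  %t = trunc <8 x i32> %src to <8 x i16>
  ret <8 x i16> %t
}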