Index: llvm/lib/Target/ARM/ARMISelLowering.h =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.h +++ llvm/lib/Target/ARM/ARMISelLowering.h @@ -139,7 +139,9 @@ PREDICATE_CAST, // Predicate cast for MVE i1 types VECTOR_REG_CAST, // Reinterpret the current contents of a vector register - MVETRUNC, // Legalization aid for truncating two vectors into one. + MVESEXT, // Legalization aids for extending a vector into two/four vectors. + MVEZEXT, // or truncating two/four vectors into one. Eventually becomes + MVETRUNC, // stack store/load sequence, if not optimized to anything else. VCMP, // Vector compare. VCMPZ, // Vector compare to zero. @@ -419,6 +421,7 @@ SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; Index: llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -452,6 +452,12 @@ setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); } + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); } @@ -1681,6 +1687,8 @@ MAKE_CASE(ARMISD::WIN__DBZCHK) MAKE_CASE(ARMISD::PREDICATE_CAST) MAKE_CASE(ARMISD::VECTOR_REG_CAST) + MAKE_CASE(ARMISD::MVESEXT) + MAKE_CASE(ARMISD::MVEZEXT) MAKE_CASE(ARMISD::MVETRUNC) MAKE_CASE(ARMISD::VCMP) MAKE_CASE(ARMISD::VCMPZ) @@ -8981,6 +8989,39 @@ return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi); } +static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + if (!Subtarget->hasMVEIntegerOps()) + return SDValue(); + + // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC. + + EVT ToVT = N->getValueType(0); + if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16) + return SDValue(); + SDValue Op = N->getOperand(0); + EVT FromVT = Op.getValueType(); + if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8) + return SDValue(); + + SDLoc DL(N); + EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); + if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) + ExtVT = MVT::v8i16; + + unsigned Opcode = + N->getOpcode() == ISD::SIGN_EXTEND ? 
ARMISD::MVESEXT : ARMISD::MVEZEXT;
+  SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
+  SDValue Ext1 = Ext.getValue(1);
+
+  if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
+    Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
+    Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
+  }
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
+}
+
 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
 /// element has been zero/sign-extended, depending on the isSigned parameter,
 /// from an integer type half its size.
@@ -10108,6 +10149,8 @@
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
   case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
   case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
   case ISD::MUL: return LowerMUL(Op, DAG);
@@ -10253,6 +10296,10 @@
   case ISD::TRUNCATE:
     Res = LowerTruncate(N, DAG, Subtarget);
     break;
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+    Res = LowerVectorExtend(N, DAG, Subtarget);
+    break;
   }
   if (Res.getNode())
     Results.push_back(Res);
@@ -16505,10 +16552,8 @@
   EVT FromEltVT = FromVT.getVectorElementType();
 
   unsigned NumElements = 0;
-  if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
+  if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
     NumElements = 4;
-  if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
-    NumElements = 8;
   if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
     NumElements = 4;
   if (NumElements == 0 ||
@@ -17351,6 +17396,178 @@
   return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
 }
 
+// Take an MVEEXT(load x) and split that into (extload x, extload x+8)
+static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,
+                                                    SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());
+  if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
+    return SDValue();
+
+  EVT FromVT = LD->getMemoryVT();
+  EVT ToVT = N->getValueType(0);
+  if (!ToVT.isVector())
+    return SDValue();
+  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
+  EVT ToEltVT = ToVT.getVectorElementType();
+  EVT FromEltVT = FromVT.getVectorElementType();
+
+  unsigned NumElements = 0;
+  if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
+    NumElements = 4;
+  if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
+    NumElements = 8;
+  assert(NumElements != 0);
+
+  ISD::LoadExtType NewExtType =
+      N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+  if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
+      LD->getExtensionType() != ISD::EXTLOAD &&
+      LD->getExtensionType() != NewExtType)
+    return SDValue();
+
+  LLVMContext &C = *DAG.getContext();
+  SDLoc DL(LD);
+  // Details about the old load
+  SDValue Ch = LD->getChain();
+  SDValue BasePtr = LD->getBasePtr();
+  Align Alignment = LD->getOriginalAlign();
+  MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+  AAMDNodes AAInfo = LD->getAAInfo();
+
+  SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
+  EVT NewFromVT = EVT::getVectorVT(
+      C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
+  EVT NewToVT = EVT::getVectorVT(
+      C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
+
+  SmallVector<SDValue, 4> Loads;
+  SmallVector<SDValue, 4> Chains;
+  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
+    unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
+    SDValue NewPtr =
+        DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
+
+    SDValue NewLoad =
+        DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
+                    LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
+                    Alignment, MMOFlags, AAInfo);
+    Loads.push_back(NewLoad);
+    Chains.push_back(SDValue(NewLoad.getNode(), 1));
+  }
+
+  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
+  return DAG.getMergeValues(Loads, DL);
+}
+
+// Perform combines for MVEEXT. If it has not been optimized to anything better
+// before lowering, it gets converted to a stack store and extloads performing
+// the extend whilst still keeping the same lane ordering.
+SDValue ARMTargetLowering::PerformMVEExtCombine(
+    SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+  assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
+  assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
+
+  EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
+      *DAG.getContext());
+  auto Extend = [&](SDValue V) {
+    SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
+    return N->getOpcode() == ARMISD::MVESEXT
+               ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
+                             DAG.getValueType(ExtVT))
+               : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
+  };
+
+  // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
+  if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
+    SDValue Ext = Extend(N->getOperand(0));
+    return DAG.getMergeValues({Ext, Ext}, DL);
+  }
+
+  // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
+  if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
+    ArrayRef<int> Mask = SVN->getMask();
+    assert(Mask.size() == 2 * VT.getVectorNumElements());
+    assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
+    unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
+    SDValue Op0 = SVN->getOperand(0);
+    SDValue Op1 = SVN->getOperand(1);
+
+    auto CheckInregMask = [&](int Start, int Offset) {
+      for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
+        if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
+          return false;
+      return true;
+    };
+    SDValue V0 = SDValue(N, 0);
+    SDValue V1 = SDValue(N, 1);
+    if (CheckInregMask(0, 0))
+      V0 = Extend(Op0);
+    else if (CheckInregMask(0, 1))
+      V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
+    else if (CheckInregMask(0, Mask.size()))
+      V0 = Extend(Op1);
+    else if (CheckInregMask(0, Mask.size() + 1))
+      V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
+
+    if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
+      V1 = Extend(Op1);
+    else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
+      V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
+    else if (CheckInregMask(VT.getVectorNumElements(), 0))
+      V1 = Extend(Op0);
+    else if (CheckInregMask(VT.getVectorNumElements(), 1))
+      V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
+
+    if (V0.getNode() != N || V1.getNode() != N)
+      return DAG.getMergeValues({V0, V1}, DL);
+  }
+
+  // MVEEXT(load) -> extload, extload
+  if (N->getOperand(0)->getOpcode() == ISD::LOAD)
+    if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))
+      return L;
+
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // Lower to a stack store and reload:
+  // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
+  SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
+  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  int NumOuts = N->getNumValues();
+  assert((NumOuts == 2 || NumOuts == 4) &&
+         "Expected 2 or 4 outputs to an MVEEXT");
+  EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
+      *DAG.getContext());
+  if (N->getNumOperands() == 4)
+    LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
+
+  MachinePointerInfo MPI =
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
+  SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
+                               StackPtr, MPI, Align(4));
+
+  SmallVector<SDValue, 4> Loads;
+  for (int I = 0; I < NumOuts; I++) {
+    SDValue Ptr = DAG.getNode(
+        ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
+        DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
+    MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
+        DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
+    SDValue Load = DAG.getExtLoad(
+        N->getOpcode() == ARMISD::MVESEXT ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, + VT, Chain, Ptr, MPI, LoadVT, Align(4)); + Loads.push_back(Load); + } + + return DAG.getMergeValues(Loads, DL); +} + SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { @@ -17426,6 +17643,9 @@ return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget); case ARMISD::MVETRUNC: return PerformMVETruncCombine(N, DCI); + case ARMISD::MVESEXT: + case ARMISD::MVEZEXT: + return PerformMVEExtCombine(N, DCI); case ARMISD::VCMP: return PerformVCMPCombine(N, DCI, Subtarget); case ISD::VECREDUCE_ADD: Index: llvm/test/CodeGen/Thumb2/mve-gather-increment.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -550,77 +550,71 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill +; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: blt .LBB11_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: bic r8, r2, #7 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: sub.w r12, r8, #8 +; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: vmov.i16 q1, #0x8 -; CHECK-NEXT: add.w r1, r4, r12, lsr #3 -; CHECK-NEXT: adr r4, .LCPI11_0 -; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: str r1, [sp] @ 4-byte Spill +; CHECK-NEXT: bic r12, r1, #7 +; CHECK-NEXT: add r1, sp, #8 +; CHECK-NEXT: sub.w r3, r12, #8 +; CHECK-NEXT: add.w r8, r5, r3, lsr #3 +; CHECK-NEXT: adr r5, .LCPI11_0 +; CHECK-NEXT: vldrw.u32 q0, [r5] ; CHECK-NEXT: .LBB11_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB11_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: dls lr, r8 ; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload ; CHECK-NEXT: .LBB11_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vmov.u16 r7, q2[6] -; CHECK-NEXT: vmov.u16 r3, q2[4] -; CHECK-NEXT: vmov q4[2], q4[0], r3, r7 -; CHECK-NEXT: vmov.u16 r3, q2[7] -; CHECK-NEXT: vmov.u16 r7, q2[5] -; CHECK-NEXT: vmov.u16 r5, q2[2] -; CHECK-NEXT: vmov q4[3], q4[1], r7, r3 -; CHECK-NEXT: vmov.u16 r6, q2[0] -; CHECK-NEXT: vmovlb.s16 q4, q4 -; CHECK-NEXT: vmov q3[2], q3[0], r6, r5 +; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: vldrh.s32 q4, [r1, #8] +; CHECK-NEXT: vldrh.s32 q3, [r1] +; CHECK-NEXT: vadd.i16 q2, q2, q1 ; CHECK-NEXT: vshl.i32 q4, q4, #1 -; CHECK-NEXT: vmov.u16 r5, q2[3] -; CHECK-NEXT: vmov.u16 r6, q2[1] -; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r6, r5 -; CHECK-NEXT: vmov r5, r6, d9 -; CHECK-NEXT: vmovlb.s16 q3, q3 -; CHECK-NEXT: vmov r3, r7, d8 ; CHECK-NEXT: vshl.i32 q3, q3, #1 -; CHECK-NEXT: vadd.i16 q2, q2, q1 +; CHECK-NEXT: vadd.i32 q4, q4, r0 ; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vmov r9, r10, d7 -; CHECK-NEXT: ldrh.w r12, [r5] -; CHECK-NEXT: vmov r5, r1, d6 -; CHECK-NEXT: ldrh.w r11, [r6] -; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov r1, r2, d9 +; CHECK-NEXT: vmov r6, r7, d7 +; CHECK-NEXT: vmov r3, r4, d8 +; CHECK-NEXT: ldrh.w r11, [r2] +; CHECK-NEXT: vmov r2, r9, d6 +; 
CHECK-NEXT: ldrh r6, [r6] ; CHECK-NEXT: ldrh r7, [r7] -; CHECK-NEXT: ldrh.w r6, [r9] -; CHECK-NEXT: ldrh.w r10, [r10] -; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q3[0], r5 -; CHECK-NEXT: vmov.16 q3[1], r1 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh.w r9, [r9] +; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov.16 q3[1], r9 ; CHECK-NEXT: vmov.16 q3[2], r6 -; CHECK-NEXT: vmov.16 q3[3], r10 +; CHECK-NEXT: vmov.16 q3[3], r7 ; CHECK-NEXT: vmov.16 q3[4], r3 -; CHECK-NEXT: vmov.16 q3[5], r7 -; CHECK-NEXT: vmov.16 q3[6], r12 +; CHECK-NEXT: vmov.16 q3[5], r4 +; CHECK-NEXT: vmov.16 q3[6], r1 +; CHECK-NEXT: mov r1, r10 ; CHECK-NEXT: vmov.16 q3[7], r11 -; CHECK-NEXT: vstrb.8 q3, [r4], #16 +; CHECK-NEXT: vstrb.8 q3, [r5], #16 ; CHECK-NEXT: le lr, .LBB11_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1 -; CHECK-NEXT: cmp r8, r2 +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: cmp r12, r2 ; CHECK-NEXT: bne .LBB11_2 ; CHECK-NEXT: .LBB11_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -676,172 +670,147 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: .pad #136 +; CHECK-NEXT: sub sp, #136 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: str r1, [sp, #56] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #64] @ 4-byte Spill ; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: str r2, [sp, #60] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #68] @ 4-byte Spill ; CHECK-NEXT: blt.w .LBB12_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: adr r6, .LCPI12_2 -; CHECK-NEXT: vldrw.u32 q1, [r6] -; CHECK-NEXT: movs r7, #1 +; CHECK-NEXT: ldr r1, [sp, #68] @ 4-byte Reload +; CHECK-NEXT: adr r3, .LCPI12_2 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: bic r1, r1, #7 -; CHECK-NEXT: str r1, [sp, #52] @ 4-byte Spill -; CHECK-NEXT: sub.w r3, r1, #8 -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q0, #0x18 -; CHECK-NEXT: add.w r1, r7, r3, lsr #3 -; CHECK-NEXT: adr r3, .LCPI12_0 -; CHECK-NEXT: vldrw.u32 q1, [r3] -; CHECK-NEXT: adr r7, .LCPI12_1 -; CHECK-NEXT: str r1, [sp, #48] @ 4-byte Spill -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [r7] -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: subs r1, #8 +; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill +; CHECK-NEXT: vmov.i16 q2, #0x18 +; CHECK-NEXT: add.w r1, r2, r1, lsr #3 +; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill +; CHECK-NEXT: adr r1, .LCPI12_0 +; CHECK-NEXT: adr r2, .LCPI12_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: add r2, sp, #120 +; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill ; CHECK-NEXT: .LBB12_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB12_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #48] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload +; CHECK-NEXT: add.w r10, sp, #104 ; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: ldr.w 
r12, [sp, #56] @ 4-byte Reload -; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload +; CHECK-NEXT: ldr r7, [sp, #64] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp, #40] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #8] @ 16-byte Reload ; CHECK-NEXT: .LBB12_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vmov.u16 r3, q5[6] -; CHECK-NEXT: vmov.u16 r5, q5[4] -; CHECK-NEXT: vmov q1[2], q1[0], r5, r3 -; CHECK-NEXT: vmov.u16 r3, q5[7] -; CHECK-NEXT: vmov.u16 r5, q5[5] -; CHECK-NEXT: vmov.u16 r4, q6[2] -; CHECK-NEXT: vmov q1[3], q1[1], r5, r3 -; CHECK-NEXT: vmov.u16 r1, q6[0] -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmov.u16 r6, q5[0] -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r7, r5, d3 -; CHECK-NEXT: vmov r3, r8, d2 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r4 -; CHECK-NEXT: vmov.u16 r1, q6[3] -; CHECK-NEXT: vmov.u16 r4, q6[1] -; CHECK-NEXT: vmov q1[3], q1[1], r4, r1 -; CHECK-NEXT: vmov.u16 r4, q5[2] -; CHECK-NEXT: vmov q2[2], q2[0], r6, r4 -; CHECK-NEXT: vmov.u16 r4, q5[3] -; CHECK-NEXT: vmov.u16 r6, q5[1] -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmov q2[3], q2[1], r6, r4 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vmovlb.s16 q2, q2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vmov r1, r11, d3 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov.u16 r6, q6[4] -; CHECK-NEXT: vadd.i16 q5, q5, q0 -; CHECK-NEXT: ldrh.w r10, [r5] -; CHECK-NEXT: vmov r4, r5, d4 -; CHECK-NEXT: ldrh r2, [r3] -; CHECK-NEXT: ldrh.w r9, [r7] -; CHECK-NEXT: vmov.u16 r7, q4[4] -; CHECK-NEXT: ldrh.w r8, [r8] +; CHECK-NEXT: vstrw.32 q5, [r2] +; CHECK-NEXT: mov r8, r2 +; CHECK-NEXT: vldrh.s32 q0, [r2, #8] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK-NEXT: vldrh.s32 q0, [r2] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vmov r6, r2, d4 ; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh.w r12, [r4] +; CHECK-NEXT: add r4, sp, #88 +; CHECK-NEXT: ldrh.w r11, [r5] +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r5, [r6] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vstrw.32 q6, [r4] +; CHECK-NEXT: vldrh.s32 q0, [r4] +; CHECK-NEXT: vmov.16 q7[0], r5 +; CHECK-NEXT: vmov.16 q7[1], r2 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r6, r9, d0 +; CHECK-NEXT: vmov r2, r5, d1 +; CHECK-NEXT: vldrh.s32 q0, [r4, #8] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: ldrh r6, [r6] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q1[0], r6 +; CHECK-NEXT: ldrh.w r6, [r9] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q1[1], r6 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov r2, r6, d0 +; CHECK-NEXT: vmov.16 q1[3], r5 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r6, [r6] +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov r2, r5, d1 +; CHECK-NEXT: vmov.16 q1[5], r6 +; CHECK-NEXT: mov r6, r10 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vstrw.32 q4, [r10] +; CHECK-NEXT: vldrh.s32 q0, [r6] +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov.16 q1[7], r5 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r2, 
r5, d0 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov.16 q3[1], r5 +; CHECK-NEXT: vmov r2, r5, d5 +; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload +; CHECK-NEXT: vadd.i16 q6, q6, q2 +; CHECK-NEXT: vadd.i16 q5, q5, q2 +; CHECK-NEXT: vadd.i16 q4, q4, q2 +; CHECK-NEXT: ldrh.w r9, [r2] +; CHECK-NEXT: vmov r2, r4, d1 +; CHECK-NEXT: vldrh.s32 q0, [r6, #8] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q7[2], r9 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vmov.16 q7[3], r5 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov.16 q7[4], r1 +; CHECK-NEXT: vmov.16 q7[5], r3 +; CHECK-NEXT: vmov.16 q7[6], r12 +; CHECK-NEXT: vmov.16 q7[7], r11 +; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q7[0], r4 -; CHECK-NEXT: ldrh r4, [r5] -; CHECK-NEXT: vmov.16 q7[1], r4 -; CHECK-NEXT: vmov r4, r5, d2 +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov.16 q3[3], r4 +; CHECK-NEXT: vmov r2, r4, d0 +; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q1[0], r4 -; CHECK-NEXT: ldrh r4, [r5] -; CHECK-NEXT: vmov.u16 r5, q6[6] -; CHECK-NEXT: vmov q3[2], q3[0], r6, r5 -; CHECK-NEXT: vmov.u16 r5, q6[7] -; CHECK-NEXT: vmov.u16 r6, q6[5] -; CHECK-NEXT: vmov.16 q1[1], r4 -; CHECK-NEXT: vmov q3[3], q3[1], r6, r5 -; CHECK-NEXT: vmov.16 q1[2], r1 -; CHECK-NEXT: vmovlb.s16 q3, q3 -; CHECK-NEXT: ldrh.w r5, [r11] -; CHECK-NEXT: vshl.i32 q3, q3, #1 -; CHECK-NEXT: vadd.i16 q6, q6, q0 -; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vmov.16 q1[3], r5 -; CHECK-NEXT: vmov r1, r4, d6 -; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q3[4], r2 +; CHECK-NEXT: vmov.16 q3[5], r4 +; CHECK-NEXT: vmov r2, r4, d1 +; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q1[4], r1 -; CHECK-NEXT: vmov r1, r3, d7 -; CHECK-NEXT: vmov.16 q1[5], r4 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q1[6], r1 -; CHECK-NEXT: vmov r1, r4, d5 -; CHECK-NEXT: ldrh r6, [r1] -; CHECK-NEXT: ldrh r1, [r3] -; CHECK-NEXT: vmov.u16 r3, q4[2] -; CHECK-NEXT: ldrh r5, [r4] -; CHECK-NEXT: vmov.u16 r4, q4[0] -; CHECK-NEXT: vmov q2[2], q2[0], r4, r3 -; CHECK-NEXT: vmov.u16 r3, q4[3] -; CHECK-NEXT: vmov.u16 r4, q4[1] -; CHECK-NEXT: vmov.16 q1[7], r1 -; CHECK-NEXT: vmov q2[3], q2[1], r4, r3 -; CHECK-NEXT: vmov.u16 r4, q4[6] -; CHECK-NEXT: vmovlb.s16 q2, q2 -; CHECK-NEXT: vmov.16 q7[2], r6 -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vmov.16 q7[3], r5 -; CHECK-NEXT: vadd.i32 q3, q2, r0 -; CHECK-NEXT: vmov.16 q7[4], r2 -; CHECK-NEXT: vmov r1, r3, d6 -; CHECK-NEXT: vmov.16 q7[5], r8 -; CHECK-NEXT: vmov.16 q7[6], r9 -; CHECK-NEXT: vmov.16 q7[7], r10 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: ldrh r1, [r3] -; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov r1, r3, d7 -; CHECK-NEXT: vmov q3[2], q3[0], r7, r4 -; CHECK-NEXT: vmov.u16 r4, q4[7] -; CHECK-NEXT: vmov.u16 r7, q4[5] -; CHECK-NEXT: vadd.i16 q4, q4, q0 -; CHECK-NEXT: vmov q3[3], q3[1], r7, r4 -; CHECK-NEXT: vmovlb.s16 q3, q3 -; CHECK-NEXT: vshl.i32 q3, q3, #1 -; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.16 q2[3], r3 -; CHECK-NEXT: vmov r1, r3, d6 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.16 q2[5], r3 -; CHECK-NEXT: vmov r1, r3, d7 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.16 
q2[7], r3 -; CHECK-NEXT: vadd.i16 q1, q2, q1 -; CHECK-NEXT: vadd.i16 q1, q1, q7 -; CHECK-NEXT: vstrb.8 q1, [r12], #16 +; CHECK-NEXT: vmov.16 q3[6], r2 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: vmov.16 q3[7], r4 +; CHECK-NEXT: vadd.i16 q0, q3, q1 +; CHECK-NEXT: vadd.i16 q0, q0, q7 +; CHECK-NEXT: vstrb.8 q0, [r7], #16 ; CHECK-NEXT: le lr, .LBB12_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: ldr r2, [sp, #52] @ 4-byte Reload -; CHECK-NEXT: cmp r2, r1 +; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #68] @ 4-byte Reload +; CHECK-NEXT: cmp r1, r3 ; CHECK-NEXT: bne.w .LBB12_2 ; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: add sp, #136 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} Index: llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -283,26 +283,35 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov r1, r2, d1 -; CHECK-NEXT: vmov r12, r3, d0 +; CHECK-NEXT: vmov r3, r1, d1 +; CHECK-NEXT: vmov r12, r2, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r4, r5, d0 -; CHECK-NEXT: vmov r0, lr, d1 -; CHECK-NEXT: ldrh r7, [r2] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh.w r2, [r12] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 -; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov lr, r0, d1 +; CHECK-NEXT: ldrh r7, [r1] +; CHECK-NEXT: ldrh.w r1, [r12] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: vmov r0, r5, d0 ; CHECK-NEXT: ldrh.w r6, [lr] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov q1[3], q1[1], r3, r7 -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r6 -; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: vmov.16 q0[2], r6 +; CHECK-NEXT: vmov.16 q0[3], r4 +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r7 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vldrh.s32 q0, [r0] +; CHECK-NEXT: vldrh.s32 q1, [r0, #8] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4 @@ -316,26 +325,35 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov r1, r2, d1 -; CHECK-NEXT: vmov r12, r3, d0 +; CHECK-NEXT: vmov r3, r1, d1 +; CHECK-NEXT: vmov r12, r2, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r4, r5, d0 -; CHECK-NEXT: vmov r0, lr, d1 -; CHECK-NEXT: ldrh r7, [r2] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh.w r2, [r12] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 -; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov lr, r0, d1 +; CHECK-NEXT: ldrh r7, [r1] +; 
CHECK-NEXT: ldrh.w r1, [r12] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: vmov r0, r5, d0 ; CHECK-NEXT: ldrh.w r6, [lr] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov q1[3], q1[1], r3, r7 -; CHECK-NEXT: vmovlb.u16 q1, q1 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r6 -; CHECK-NEXT: vmovlb.u16 q0, q0 +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: vmov.16 q0[2], r6 +; CHECK-NEXT: vmov.16 q0[3], r4 +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r7 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vldrh.u32 q0, [r0] +; CHECK-NEXT: vldrh.u32 q1, [r0, #8] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4 Index: llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll +++ llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll @@ -275,88 +275,50 @@ define arm_aapcs_vfpcc <16 x i8> @ext_add_ashr_trunc_i8i32(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: ext_add_ashr_trunc_i8i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vmov.u8 r1, q1[14] -; CHECK-NEXT: vmov.u8 r2, q1[12] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q1[15] -; CHECK-NEXT: vmov.u8 r2, q1[13] -; CHECK-NEXT: vmov.i32 q2, #0xff -; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q0[14] -; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q0[15] -; CHECK-NEXT: vmov.u8 r2, q0[13] -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vmov q4[3], q4[1], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q1[10] -; CHECK-NEXT: vmovlb.s8 q4, q4 -; CHECK-NEXT: vmov.u8 r2, q1[8] -; CHECK-NEXT: vmovlb.s16 q4, q4 -; CHECK-NEXT: vadd.i32 q3, q4, q3 -; CHECK-NEXT: vshr.u32 q3, q3, #1 -; CHECK-NEXT: vstrb.32 q3, [r0, #12] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q1[11] -; CHECK-NEXT: vmov.u8 r2, q1[9] -; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: vmov.u8 r2, q0[8] -; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: vmov q4[3], q4[1], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q1[6] -; CHECK-NEXT: vmovlb.s8 q4, q4 -; CHECK-NEXT: vmov.u8 r2, q1[4] -; CHECK-NEXT: vmovlb.s16 q4, q4 -; CHECK-NEXT: vadd.i32 q3, q4, q3 -; CHECK-NEXT: vshr.u32 q3, q3, #1 -; CHECK-NEXT: vstrb.32 q3, [r0, #8] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q1[7] -; CHECK-NEXT: vmov.u8 r2, q1[5] -; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: vmov.u8 r2, q0[5] -; CHECK-NEXT: vmov q4[3], q4[1], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q1[2] -; CHECK-NEXT: vmovlb.s8 q4, q4 -; CHECK-NEXT: vmov.u8 r2, q1[0] -; CHECK-NEXT: vmovlb.s16 q4, q4 -; CHECK-NEXT: vadd.i32 q3, q4, q3 -; CHECK-NEXT: vshr.u32 q3, q3, #1 -; CHECK-NEXT: vstrb.32 q3, [r0, #4] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: 
vmov.u8 r1, q1[3] -; CHECK-NEXT: vmov.u8 r2, q1[1] -; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: vand q1, q3, q2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: vmov.u8 r2, q0[1] -; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 -; CHECK-NEXT: vmovlb.s8 q0, q2 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .pad #112 +; CHECK-NEXT: sub sp, #112 +; CHECK-NEXT: add r1, sp, #16 +; CHECK-NEXT: mov r4, sp +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r4] +; CHECK-NEXT: vldrb.u16 q0, [r1, #8] +; CHECK-NEXT: add r3, sp, #64 +; CHECK-NEXT: add r5, sp, #32 +; CHECK-NEXT: add r0, sp, #80 +; CHECK-NEXT: vstrw.32 q0, [r3] +; CHECK-NEXT: add r2, sp, #48 +; CHECK-NEXT: vldrb.s16 q0, [r4, #8] +; CHECK-NEXT: vstrw.32 q0, [r5] +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: add r1, sp, #96 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vldrb.s16 q0, [r4] +; CHECK-NEXT: vstrw.32 q0, [r2] +; CHECK-NEXT: vldrh.u32 q0, [r3, #8] +; CHECK-NEXT: vldrh.s32 q1, [r5, #8] +; CHECK-NEXT: vadd.i32 q0, q1, q0 ; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vstrb.32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vstrb.32 q0, [r1, #12] +; CHECK-NEXT: vldrh.u32 q0, [r3] +; CHECK-NEXT: vldrh.s32 q1, [r5] +; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: vstrb.32 q0, [r1, #8] +; CHECK-NEXT: vldrh.u32 q0, [r0, #8] +; CHECK-NEXT: vldrh.s32 q1, [r2, #8] +; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: vstrb.32 q0, [r1, #4] +; CHECK-NEXT: vldrh.u32 q0, [r0] +; CHECK-NEXT: vldrh.s32 q1, [r2] +; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: vstrb.32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add sp, #112 +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %sa = sext <16 x i8> %a to <16 x i32> %sb = zext <16 x i8> %b to <16 x i32> Index: llvm/test/CodeGen/Thumb2/mve-sext.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-sext.ll +++ llvm/test/CodeGen/Thumb2/mve-sext.ll @@ -125,41 +125,13 @@ define arm_aapcs_vfpcc <16 x i16> @sext_v16i8_v16i16(<16 x i8> %src) { ; CHECK-LABEL: sext_v16i8_v16i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmovlb.s8 q2, q1 -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u8 
r0, q0[15] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vmovlb.s8 q1, q1 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vldrb.s16 q0, [r0] +; CHECK-NEXT: vldrb.s16 q1, [r0, #8] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %0 = sext <16 x i8> %src to <16 x i16> @@ -169,21 +141,13 @@ define arm_aapcs_vfpcc <8 x i32> @sext_v8i16_v8i32(<8 x i16> %src) { ; CHECK-LABEL: sext_v8i16_v8i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmovlb.s16 q2, q1 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vldrh.s32 q0, [r0] +; CHECK-NEXT: vldrh.s32 q1, [r0, #8] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %0 = sext <8 x i16> %src to <8 x i32> @@ -193,42 +157,21 @@ define arm_aapcs_vfpcc <16 x i32> @sext_v16i8_v16i32(<16 x i8> %src) { ; CHECK-LABEL: sext_v16i8_v16i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: vmovlb.s16 q4, q1 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.u8 r1, q0[8] -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: vmovlb.s8 q2, q2 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: vmovlb.s16 q2, q2 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmovlb.s8 q0, q3 -; CHECK-NEXT: vmovlb.s16 q3, q0 -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: add r1, sp, #32 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vldrb.s16 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vldrb.s16 q0, [r0, #8] +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vldrh.s32 q0, [r1] +; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vldrh.s32 q2, [r0] +; CHECK-NEXT: vldrh.s32 q3, [r0, #8] +; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: bx lr entry: %0 = sext <16 x i8> %src to <16 x i32> @@ -285,41 +228,13 @@ define arm_aapcs_vfpcc <16 x i16> @zext_v16i8_v16i16(<16 x i8> %src) { ; CHECK-LABEL: zext_v16i8_v16i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.16 q1[1], 
r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmovlb.u8 q2, q1 -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vmovlb.u8 q1, q1 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vldrb.u16 q0, [r0] +; CHECK-NEXT: vldrb.u16 q1, [r0, #8] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %0 = zext <16 x i8> %src to <16 x i16> @@ -329,21 +244,13 @@ define arm_aapcs_vfpcc <8 x i32> @zext_v8i16_v8i32(<8 x i16> %src) { ; CHECK-LABEL: zext_v8i16_v8i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmovlb.u16 q2, q1 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vmovlb.u16 q1, q1 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vldrh.u32 q0, [r0] +; CHECK-NEXT: vldrh.u32 q1, [r0, #8] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %0 = zext <8 x i16> %src to <8 x i32> @@ -353,39 +260,21 @@ define arm_aapcs_vfpcc <16 x i32> @zext_v16i8_v16i32(<16 x i8> %src) { ; CHECK-LABEL: zext_v16i8_v16i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vmov.i32 q3, #0xff -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: vand q4, q1, q3 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.u8 r1, q0[8] -; CHECK-NEXT: vand q1, q1, q3 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: vand q2, q2, q3 -; CHECK-NEXT: vmov q5[2], q5[0], r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov q5[3], q5[1], r1, r0 -; CHECK-NEXT: vand q3, q5, q3 -; CHECK-NEXT: vpop 
{d8, d9, d10, d11} +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: add r1, sp, #32 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vldrb.u16 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vldrb.u16 q0, [r0, #8] +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrh.u32 q1, [r1, #8] +; CHECK-NEXT: vldrh.u32 q2, [r0] +; CHECK-NEXT: vldrh.u32 q3, [r0, #8] +; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: bx lr entry: %0 = zext <16 x i8> %src to <16 x i32> Index: llvm/test/CodeGen/Thumb2/mve-shuffleext.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-shuffleext.ll +++ llvm/test/CodeGen/Thumb2/mve-shuffleext.ll @@ -17,23 +17,16 @@ define arm_aapcs_vfpcc <4 x i32> @sext_i32_0246_swapped(<8 x i16> %src) { ; CHECK-LABEL: sext_i32_0246_swapped: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmovlb.s16 q0, q1 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vldrh.s32 q0, [r0] +; CHECK-NEXT: vldrh.s32 q1, [r0, #8] ; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmovlb.s16 q2, q2 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s10 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %out = sext <8 x i16> %src to <8 x i32> @@ -55,24 +48,17 @@ define arm_aapcs_vfpcc <4 x i32> @sext_i32_1357_swapped(<8 x i16> %src) { ; CHECK-LABEL: sext_i32_1357_swapped: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmov.f32 s0, s5 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov.f32 s1, s7 -; CHECK-NEXT: vmovlb.s16 q2, q2 -; CHECK-NEXT: vmov.f32 s2, s9 -; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vldrh.s32 q2, [r0] +; CHECK-NEXT: vldrh.s32 q1, [r0, #8] +; CHECK-NEXT: vmov.f32 s0, s9 +; CHECK-NEXT: vmov.f32 s1, s11 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %out = sext <8 x i16> %src to <8 x i32> @@ -95,40 +81,23 @@ define arm_aapcs_vfpcc <8 x i32> @sext_i32_02468101214_swapped(<16 x i16> %src) { ; CHECK-LABEL: sext_i32_02468101214_swapped: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmov q3[2], q3[0], r1, 
r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmovlb.s16 q0, q2 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vmovlb.s16 q3, q3 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: add r1, sp, #16 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vldrh.s32 q0, [r0] +; CHECK-NEXT: vldrh.s32 q1, [r0, #8] +; CHECK-NEXT: vldrh.s32 q2, [r1, #8] ; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.u16 r1, q1[4] -; CHECK-NEXT: vmov.f32 s3, s14 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.u16 r1, q1[5] -; CHECK-NEXT: vmovlb.s16 q1, q2 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmovlb.s16 q3, q3 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vldrh.s32 q1, [r1] ; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vmov.f32 s7, s14 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr entry: %out = sext <16 x i16> %src to <16 x i32> @@ -151,42 +120,25 @@ define arm_aapcs_vfpcc <8 x i32> @sext_i32_13579111315_swapped(<16 x i16> %src) { ; CHECK-LABEL: sext_i32_13579111315_swapped: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmovlb.s16 q2, q2 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: add r1, sp, #16 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vldrh.s32 q2, [r0] +; CHECK-NEXT: vldrh.s32 q1, [r0, #8] +; CHECK-NEXT: vldrh.s32 q3, [r1] ; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.u16 r1, q1[0] ; CHECK-NEXT: vmov.f32 s1, s11 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmovlb.s16 q3, q3 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov.f32 s2, s13 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.u16 r1, q1[4] -; CHECK-NEXT: vmov.f32 s3, s15 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.u16 r1, q1[5] -; CHECK-NEXT: vmovlb.s16 q2, q2 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.f32 s4, s9 -; CHECK-NEXT: vmovlb.s16 q3, q3 -; CHECK-NEXT: vmov.f32 s5, s11 -; CHECK-NEXT: vmov.f32 s6, s13 -; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vldrh.s32 q2, [r1, #8] +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vmov.f32 s4, s13 +; CHECK-NEXT: vmov.f32 s5, s15 +; CHECK-NEXT: vmov.f32 s6, s9 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr entry: %out = sext <16 x i16> %src to <16 x i32> @@ -208,17 +160,16 @@ define arm_aapcs_vfpcc <4 x i32> @zext_i32_0246_swapped(<8 x i16> %src) { ; CHECK-LABEL: 
zext_i32_0246_swapped:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r0, q0[2]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[6]
-; CHECK-NEXT: vmov.u16 r1, q0[4]
-; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT: vmovlb.u16 q2, q0
-; CHECK-NEXT: vmovlb.u16 q0, q1
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vldrh.u32 q0, [r0]
+; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
 ; CHECK-NEXT: vmov.f32 s1, s2
-; CHECK-NEXT: vmov.f32 s2, s8
-; CHECK-NEXT: vmov.f32 s3, s10
+; CHECK-NEXT: vmov.f32 s2, s4
+; CHECK-NEXT: vmov.f32 s3, s6
+; CHECK-NEXT: add sp, #16
 ; CHECK-NEXT: bx lr
 entry:
 %out = zext <8 x i16> %src to <8 x i32>
@@ -240,18 +191,17 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_i32_1357_swapped(<8 x i16> %src) {
 ; CHECK-LABEL: zext_i32_1357_swapped:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: vmovlb.u16 q1, q1
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT: vmovlb.u16 q2, q0
-; CHECK-NEXT: vmov.f32 s0, s5
-; CHECK-NEXT: vmov.f32 s1, s7
-; CHECK-NEXT: vmov.f32 s2, s9
-; CHECK-NEXT: vmov.f32 s3, s11
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vldrh.u32 q2, [r0]
+; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
+; CHECK-NEXT: vmov.f32 s0, s9
+; CHECK-NEXT: vmov.f32 s1, s11
+; CHECK-NEXT: vmov.f32 s2, s5
+; CHECK-NEXT: vmov.f32 s3, s7
+; CHECK-NEXT: add sp, #16
 ; CHECK-NEXT: bx lr
 entry:
 %out = zext <8 x i16> %src to <8 x i32>
@@ -274,28 +224,23 @@
 define arm_aapcs_vfpcc <8 x i32> @zext_i32_02468101214_swapped(<16 x i16> %src) {
 ; CHECK-LABEL: zext_i32_02468101214_swapped:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r0, q0[2]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[6]
-; CHECK-NEXT: vmov.u16 r1, q0[4]
-; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmovlb.u16 q3, q0
-; CHECK-NEXT: vmovlb.u16 q0, q2
-; CHECK-NEXT: vmov.u16 r1, q1[0]
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: add r1, sp, #16
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0]
+; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
+; CHECK-NEXT: vldrh.u32 q2, [r1, #8]
 ; CHECK-NEXT: vmov.f32 s1, s2
-; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[6]
-; CHECK-NEXT: vmov.u16 r1, q1[4]
-; CHECK-NEXT: vmov.f32 s2, s12
-; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT: vmov.f32 s3, s14
-; CHECK-NEXT: vmovlb.u16 q3, q1
-; CHECK-NEXT: vmovlb.u16 q1, q2
+; CHECK-NEXT: vmov.f32 s2, s4
+; CHECK-NEXT: vmov.f32 s3, s6
+; CHECK-NEXT: vldrh.u32 q1, [r1]
 ; CHECK-NEXT: vmov.f32 s5, s6
-; CHECK-NEXT: vmov.f32 s6, s12
-; CHECK-NEXT: vmov.f32 s7, s14
+; CHECK-NEXT: vmov.f32 s6, s8
+; CHECK-NEXT: vmov.f32 s7, s10
+; CHECK-NEXT: add sp, #32
 ; CHECK-NEXT: bx lr
 entry:
 %out = zext <16 x i16> %src to <16 x i32>
@@ -318,30 +263,25 @@
 define arm_aapcs_vfpcc <8 x i32> @zext_i32_13579111315_swapped(<16 x i16> %src) {
 ; CHECK-LABEL: zext_i32_13579111315_swapped:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: vmovlb.u16 q2, q2
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmovlb.u16 q3, q0
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: add r1, sp, #16
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q2, [r0]
+; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
+; CHECK-NEXT: vldrh.u32 q3, [r1]
 ; CHECK-NEXT: vmov.f32 s0, s9
-; CHECK-NEXT: vmov.u16 r1, q1[1]
 ; CHECK-NEXT: vmov.f32 s1, s11
-; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.u16 r1, q1[5]
-; CHECK-NEXT: vmov.f32 s2, s13
-; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT: vmovlb.u16 q2, q2
-; CHECK-NEXT: vmov.f32 s3, s15
-; CHECK-NEXT: vmovlb.u16 q3, q1
-; CHECK-NEXT: vmov.f32 s4, s9
-; CHECK-NEXT: vmov.f32 s5, s11
-; CHECK-NEXT: vmov.f32 s6, s13
-; CHECK-NEXT: vmov.f32 s7, s15
+; CHECK-NEXT: vldrh.u32 q2, [r1, #8]
+; CHECK-NEXT: vmov.f32 s2, s5
+; CHECK-NEXT: vmov.f32 s3, s7
+; CHECK-NEXT: vmov.f32 s4, s13
+; CHECK-NEXT: vmov.f32 s5, s15
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: add sp, #32
 ; CHECK-NEXT: bx lr
 entry:
 %out = zext <16 x i16> %src to <16 x i32>
Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -524,48 +524,37 @@
 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8_v8i32_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
 ; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vmovlb.u8 q1, q1
-; CHECK-NEXT: vcmp.i16 eq, q1, zr
-; CHECK-NEXT: vmov.u16 r0, q0[2]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: vmov.i8 q1, #0x0
-; CHECK-NEXT: vmov.i8 q2, #0xff
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.u16 r1, q0[1]
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vmovlb.u8 q0, q1
+; CHECK-NEXT: vcmp.i16 eq, q0, zr
+; CHECK-NEXT: vmov.i8 q0, #0x0
+; CHECK-NEXT: vmov.i8 q1, #0xff
+; CHECK-NEXT: vldrh.u32 q2, [r0]
+; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: vmov q1[2], q1[0], r2, r1
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: vmov.u16 r2, q0[1]
+; CHECK-NEXT: vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT: vmov.u16 r1, q0[6]
+; CHECK-NEXT: vcmp.i32 ne, q1, zr
+; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: vmov.u16 r2, q0[4]
 ; CHECK-NEXT: vpsel q1, q2, q1
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmov.u16 r1, q1[0]
-; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.u16 r1, q1[1]
-; CHECK-NEXT: vmov.i32 q4, #0xffff
-; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[6]
-; CHECK-NEXT: vcmp.i32 ne, q2, zr
-; CHECK-NEXT: vmov.i32 q2, #0x0
-; CHECK-NEXT: vmov.u16 r1, q0[4]
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vandt q2, q3, q4
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[6]
-; CHECK-NEXT: vmov.u16 r1, q1[4]
-; CHECK-NEXT: vmovlb.u16 q0, q3
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.u16 r1, q1[5]
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vpt.i32 ne, q3, zr
-; CHECK-NEXT: vaddt.i32 q2, q2, q0
-; CHECK-NEXT: vaddv.u32 r0, q2
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov.u16 r2, q0[5]
+; CHECK-NEXT: vldrh.u32 q0, [r0, #8]
+; CHECK-NEXT: vmov q2[3], q2[1], r2, r1
+; CHECK-NEXT: vpt.i32 ne, q2, zr
+; CHECK-NEXT: vaddt.i32 q1, q1, q0
+; CHECK-NEXT: vaddv.u32 r0, q1
+; CHECK-NEXT: add sp, #16
 ; CHECK-NEXT: bx lr
 entry:
 %c = icmp eq <8 x i8> %b, zeroinitializer
Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
+++ llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
@@ -309,36 +309,23 @@
 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-LABEL: add_v8i8_v8i32_zext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
 ; CHECK-NEXT: vmovlb.u8 q1, q1
+; CHECK-NEXT: add r0, sp, #16
 ; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vmov.u16 r0, q1[6]
-; CHECK-NEXT: vmov.u16 r1, q1[4]
-; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.u16 r1, q1[5]
-; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[6]
-; CHECK-NEXT: vmov.u16 r1, q0[4]
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmov.u16 r1, q1[0]
-; CHECK-NEXT: vmullb.u16 q2, q3, q2
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.u16 r1, q1[1]
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[2]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT: vmullb.u16 q0, q1, q3
-; CHECK-NEXT: vadd.i32 q0, q0, q2
+; CHECK-NEXT: mov r1, sp
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, #8]
+; CHECK-NEXT: vldrh.u32 q1, [r1, #8]
+; CHECK-NEXT: vldrh.u32 q2, [r1]
+; CHECK-NEXT: vmul.i32 q0, q1, q0
+; CHECK-NEXT: vldrh.u32 q1, [r0]
+; CHECK-NEXT: vmul.i32 q1, q2, q1
+; CHECK-NEXT: vadd.i32 q0, q1, q0
 ; CHECK-NEXT: vaddv.u32 r0, q0
+; CHECK-NEXT: add sp, #32
 ; CHECK-NEXT: bx lr
 entry:
 %xx = zext <8 x i8> %x to <8 x i32>
Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
+++ llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
@@ -489,62 +489,44 @@
 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8_v8i32_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
 ; CHECK-NEXT: vmovlb.u8 q1, q1
+; CHECK-NEXT: add r0, sp, #16
 ; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmov.u16 r1, q1[0]
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.u16 r1, q1[1]
-; CHECK-NEXT: vmovlb.u8 q2, q2
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[2]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: vmovlb.u16 q4, q3
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vcmp.i16 eq, q2, zr
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vmov.i8 q2, #0x0
-; CHECK-NEXT: vmovlb.u16 q5, q3
-; CHECK-NEXT: vmov.i8 q3, #0xff
-; CHECK-NEXT: vpsel q2, q3, q2
-; CHECK-NEXT: vmov.u16 r0, q2[2]
-; CHECK-NEXT: vmov.u16 r1, q2[0]
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q2[3]
-; CHECK-NEXT: vmov.u16 r1, q2[1]
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[6]
-; CHECK-NEXT: vcmp.i32 ne, q3, zr
-; CHECK-NEXT: vmov.i32 q3, #0x0
-; CHECK-NEXT: vmov.u16 r1, q1[4]
+; CHECK-NEXT: mov r1, sp
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: vmovlb.u8 q0, q2
+; CHECK-NEXT: vmov.i8 q1, #0xff
+; CHECK-NEXT: vcmp.i16 eq, q0, zr
+; CHECK-NEXT: vmov.i8 q0, #0x0
+; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vldrh.u32 q2, [r0]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT: vmov.u16 r2, q0[3]
+; CHECK-NEXT: vmov.u16 r3, q0[1]
+; CHECK-NEXT: vldrh.u32 q3, [r1]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT: vcmp.i32 ne, q1, zr
+; CHECK-NEXT: vmov.i32 q1, #0x0
 ; CHECK-NEXT: vpst
-; CHECK-NEXT: vmult.i32 q3, q5, q4
-; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.u16 r1, q1[5]
-; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
+; CHECK-NEXT: vmult.i32 q1, q3, q2
+; CHECK-NEXT: vldrh.u32 q2, [r0, #8]
+; CHECK-NEXT: vldrh.u32 q3, [r1, #8]
 ; CHECK-NEXT: vmov.u16 r0, q0[6]
 ; CHECK-NEXT: vmov.u16 r1, q0[4]
-; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT: vmul.i32 q2, q3, q2
+; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT: vmov.u16 r0, q0[7]
 ; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q2[6]
-; CHECK-NEXT: vmov.u16 r1, q2[4]
-; CHECK-NEXT: vmullb.u16 q0, q1, q4
-; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT: vmov.u16 r0, q2[7]
-; CHECK-NEXT: vmov.u16 r1, q2[5]
-; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vaddt.i32 q3, q3, q0
-; CHECK-NEXT: vaddv.u32 r0, q3
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
+; CHECK-NEXT: vpt.i32 ne, q3, zr
+; CHECK-NEXT: vaddt.i32 q1, q1, q2
+; CHECK-NEXT: vaddv.u32 r0, q1
+; CHECK-NEXT: add sp, #32
 ; CHECK-NEXT: bx lr
 entry:
 %c = icmp eq <8 x i8> %b, zeroinitializer
Index: llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
+++ llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
@@ -712,7 +712,7 @@
 define arm_aapcs_vfpcc <8 x i32> @sext16_02461357_0ext(<16 x i16> %src1, i16 %src2) {
 ; CHECK-LABEL: sext16_02461357_0ext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.32 q2, r0
+; CHECK-NEXT: vdup.16 q2, r0
 ; CHECK-NEXT: vrev32.16 q1, q0
 ; CHECK-NEXT: vmullb.s16 q1, q1, q2
 ; CHECK-NEXT: vmullb.s16 q0, q0, q2
@@ -731,7 +731,7 @@
 ; CHECK-LABEL: sext16_0ext_02461357:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vrev32.16 q1, q0
-; CHECK-NEXT: vdup.32 q2, r0
+; CHECK-NEXT: vdup.16 q2, r0
 ; CHECK-NEXT: vmullb.s16 q1, q2, q1
 ; CHECK-NEXT: vmullb.s16 q0, q2, q0
 ; CHECK-NEXT: bx lr
@@ -922,7 +922,7 @@
 define arm_aapcs_vfpcc <8 x i32> @zext16_02461357_0ext(<16 x i16> %src1, i16 %src2) {
 ; CHECK-LABEL: zext16_02461357_0ext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.32 q2, r0
+; CHECK-NEXT: vdup.16 q2, r0
 ; CHECK-NEXT: vrev32.16 q1, q0
 ; CHECK-NEXT: vmullb.u16 q1, q1, q2
 ; CHECK-NEXT: vmullb.u16 q0, q0, q2
@@ -941,7 +941,7 @@
 ; CHECK-LABEL: zext16_0ext_02461357:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vrev32.16 q1, q0
-; CHECK-NEXT: vdup.32 q2, r0
+; CHECK-NEXT: vdup.16 q2, r0
 ; CHECK-NEXT: vmullb.u16 q1, q2, q1
 ; CHECK-NEXT: vmullb.u16 q0, q2, q0
 ; CHECK-NEXT: bx lr
@@ -1132,7 +1132,7 @@
 define arm_aapcs_vfpcc <16 x i16> @sext8_0246810121413579111315_0ext(<32 x i8> %src1, i8 %src2) {
 ; CHECK-LABEL: sext8_0246810121413579111315_0ext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.16 q2, r0
+; CHECK-NEXT: vdup.8 q2, r0
 ; CHECK-NEXT: vrev16.8 q1, q0
 ; CHECK-NEXT: vmullb.s8 q1, q1, q2
 ; CHECK-NEXT: vmullb.s8 q0, q0, q2
@@ -1151,7 +1151,7 @@
 ; CHECK-LABEL: sext8_0ext_0246810121413579111315:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vrev16.8 q1, q0
-; CHECK-NEXT: vdup.16 q2, r0
+; CHECK-NEXT: vdup.8 q2, r0
 ; CHECK-NEXT: vmullb.s8 q1, q2, q1
 ; CHECK-NEXT: vmullb.s8 q0, q2, q0
 ; CHECK-NEXT: bx lr
@@ -1342,7 +1342,7 @@
 define arm_aapcs_vfpcc <16 x i16> @zext8_0246810121413579111315_0ext(<32 x i8> %src1, i8 %src2) {
 ; CHECK-LABEL: zext8_0246810121413579111315_0ext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.16 q2, r0
+; CHECK-NEXT: vdup.8 q2, r0
 ; CHECK-NEXT: vrev16.8 q1, q0
 ; CHECK-NEXT: vmullb.u8 q1, q1, q2
 ; CHECK-NEXT: vmullb.u8 q0, q0, q2
@@ -1361,7 +1361,7 @@
 ; CHECK-LABEL: zext8_0ext_0246810121413579111315:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vrev16.8 q1, q0
-; CHECK-NEXT: vdup.16 q2, r0
+; CHECK-NEXT: vdup.8 q2, r0
 ; CHECK-NEXT: vmullb.u8 q1, q2, q1
 ; CHECK-NEXT: vmullb.u8 q0, q2, q0
 ; CHECK-NEXT: bx lr
Index: llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
+++ llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
@@ -349,43 +349,47 @@
 define void @foo_int32_int8_both(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
-; CHECK-LABEL: foo_int32_int8_both:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s16 q1, [r1, #8]
-; CHECK-NEXT: vmov.u16 r2, q1[6]
-; CHECK-NEXT: vmov.u16 r3, q1[4]
-; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
-; CHECK-NEXT: vmov.u16 r2, q1[7]
-; CHECK-NEXT: vmov.u16 r3, q1[5]
-; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
-; CHECK-NEXT: vmov.u16 r2, q1[0]
-; CHECK-NEXT: vmovlb.u16 q2, q0
-; CHECK-NEXT: vldrb.s16 q0, [r1]
-; CHECK-NEXT: vmov.u16 r1, q1[2]
-; CHECK-NEXT: vstrw.32 q2, [r0, #48]
-; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
-; CHECK-NEXT: vmov.u16 r1, q1[3]
-; CHECK-NEXT: vmov.u16 r2, q1[1]
-; CHECK-NEXT: vmov q2[3], q2[1], r2, r1
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: vmovlb.u16 q1, q2
-; CHECK-NEXT: vmov.u16 r2, q0[4]
-; CHECK-NEXT: vstrw.32 q1, [r0, #32]
-; CHECK-NEXT: vmov q1[2], q1[0], r2, r1
-; CHECK-NEXT: vmov.u16 r1, q0[7]
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: vmov q1[3], q1[1], r2, r1
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: vmovlb.u16 q1, q1
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: vstrw.32 q1, [r0, #16]
-; CHECK-NEXT: vmov q1[2], q1[0], r2, r1
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: vmov.u16 r2, q0[1]
-; CHECK-NEXT: vmov q1[3], q1[1], r2, r1
-; CHECK-NEXT: vmovlb.u16 q0, q1
-; CHECK-NEXT: vstrw.32 q0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-LE-LABEL: foo_int32_int8_both:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: .pad #32
+; CHECK-LE-NEXT: sub sp, #32
+; CHECK-LE-NEXT: vldrb.s16 q0, [r1, #8]
+; CHECK-LE-NEXT: add r2, sp, #16
+; CHECK-LE-NEXT: vstrw.32 q0, [r2]
+; CHECK-LE-NEXT: vldrb.s16 q0, [r1]
+; CHECK-LE-NEXT: mov r1, sp
+; CHECK-LE-NEXT: vstrw.32 q0, [r1]
+; CHECK-LE-NEXT: vldrh.u32 q0, [r2, #8]
+; CHECK-LE-NEXT: vstrw.32 q0, [r0, #48]
+; CHECK-LE-NEXT: vldrh.u32 q0, [r2]
+; CHECK-LE-NEXT: vstrw.32 q0, [r0, #32]
+; CHECK-LE-NEXT: vldrh.u32 q0, [r1, #8]
+; CHECK-LE-NEXT: vstrw.32 q0, [r0, #16]
+; CHECK-LE-NEXT: vldrh.u32 q0, [r1]
+; CHECK-LE-NEXT: vstrw.32 q0, [r0]
+; CHECK-LE-NEXT: add sp, #32
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: foo_int32_int8_both:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .pad #32
+; CHECK-BE-NEXT: sub sp, #32
+; CHECK-BE-NEXT: vldrb.s16 q0, [r1, #8]
+; CHECK-BE-NEXT: add r2, sp, #16
+; CHECK-BE-NEXT: vstrh.16 q0, [r2]
+; CHECK-BE-NEXT: vldrb.s16 q0, [r1]
+; CHECK-BE-NEXT: mov r1, sp
+; CHECK-BE-NEXT: vstrh.16 q0, [r1]
+; CHECK-BE-NEXT: vldrh.u32 q0, [r2, #8]
+; CHECK-BE-NEXT: vstrw.32 q0, [r0, #48]
+; CHECK-BE-NEXT: vldrh.u32 q0, [r2]
+; CHECK-BE-NEXT: vstrw.32 q0, [r0, #32]
+; CHECK-BE-NEXT: vldrh.u32 q0, [r1, #8]
+; CHECK-BE-NEXT: vstrw.32 q0, [r0, #16]
+; CHECK-BE-NEXT: vldrh.u32 q0, [r1]
+; CHECK-BE-NEXT: vstrw.32 q0, [r0]
+; CHECK-BE-NEXT: add sp, #32
+; CHECK-BE-NEXT: bx lr
 entry:
 %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
 %0 = sext <16 x i8> %wide.load to <16 x i16>
@@ -416,12 +420,12 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vldrh.s32 q0, [r1, #32]!
 ; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
-; CHECK-NEXT: vldrh.s32 q2, [r1, #16]
-; CHECK-NEXT: vldrh.s32 q3, [r1, #24]
+; CHECK-NEXT: vldrh.s32 q2, [r1, #24]
+; CHECK-NEXT: vldrh.s32 q3, [r1, #16]
 ; CHECK-NEXT: vstrw.32 q0, [r0]
-; CHECK-NEXT: vstrw.32 q2, [r0, #32]
+; CHECK-NEXT: vstrw.32 q2, [r0, #48]
 ; CHECK-NEXT: vstrw.32 q1, [r0, #16]
-; CHECK-NEXT: vstrw.32 q3, [r0, #48]
+; CHECK-NEXT: vstrw.32 q3, [r0, #32]
 ; CHECK-NEXT: mov r0, r1
 ; CHECK-NEXT: bx lr
 entry: