diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -1080,12 +1080,11 @@
    [llvm_i32_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
    llvm_anyvector_ty>;
 
-def int_arm_mve_vld2q: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_anyptr_ty], [IntrReadMem]>;
-def int_arm_mve_vld4q: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty], [IntrReadMem]>;
+def int_arm_mve_vld2q: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_anyptr_ty], [IntrReadMem, IntrArgMemOnly]>;
+def int_arm_mve_vld4q: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty], [IntrReadMem, IntrArgMemOnly]>;
 
-def int_arm_mve_vst2q: Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrWriteMem]>;
-def int_arm_mve_vst4q: Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>, llvm_i32_ty], [IntrWriteMem]
->;
+def int_arm_mve_vst2q: Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrWriteMem, IntrArgMemOnly]>;
+def int_arm_mve_vst4q: Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>, llvm_i32_ty], [IntrWriteMem, IntrArgMemOnly]>;
 
 // MVE vector absolute difference and accumulate across vector
 // The first operand is an 'unsigned' flag. The remaining operands are:
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -266,7 +266,7 @@
   /// pointer points to a set of NumVecs sub-opcodes used for the
   /// different stages (e.g. VLD20 versus VLD21) of each load family.
   void SelectMVE_VLD(SDNode *N, unsigned NumVecs,
-                     const uint16_t *const *Opcodes);
+                     const uint16_t *const *Opcodes, bool HasWriteback);
 
   /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
   /// should be 1, 2, 3 or 4. The opcode array specifies the instructions used
@@ -2038,6 +2038,7 @@
                                 const uint16_t *DOpcodes,
                                 const uint16_t *QOpcodes0,
                                 const uint16_t *QOpcodes1) {
+  assert(Subtarget->hasNEON());
   assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
   SDLoc dl(N);
 
@@ -2177,6 +2178,7 @@
                                 const uint16_t *DOpcodes,
                                 const uint16_t *QOpcodes0,
                                 const uint16_t *QOpcodes1) {
+  assert(Subtarget->hasNEON());
   assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
   SDLoc dl(N);
 
@@ -2328,6 +2330,7 @@
                                       unsigned NumVecs,
                                       const uint16_t *DOpcodes,
                                       const uint16_t *QOpcodes) {
+  assert(Subtarget->hasNEON());
   assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
   SDLoc dl(N);
 
@@ -2673,7 +2676,8 @@
 }
 
 void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs,
-                                    const uint16_t *const *Opcodes) {
+                                    const uint16_t *const *Opcodes,
+                                    bool HasWriteback) {
   EVT VT = N->getValueType(0);
   SDLoc Loc(N);
 
@@ -2693,23 +2697,35 @@
   }
 
   EVT DataTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, NumVecs * 2);
-  EVT ResultTys[] = {DataTy, MVT::Other};
+  SmallVector<EVT, 4> ResultTys = {DataTy, MVT::Other};
+  unsigned PtrOperand = HasWriteback ? 1 : 2;
 
   auto Data = SDValue(
       CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, Loc, DataTy), 0);
   SDValue Chain = N->getOperand(0);
-  for (unsigned Stage = 0; Stage < NumVecs; ++Stage) {
-    SDValue Ops[] = {Data, N->getOperand(2), Chain};
+  // Add a MVE_VLDn instruction for each Vec, except the last
+  for (unsigned Stage = 0; Stage < NumVecs - 1; ++Stage) {
+    SDValue Ops[] = {Data, N->getOperand(PtrOperand), Chain};
     auto LoadInst =
         CurDAG->getMachineNode(OurOpcodes[Stage], Loc, ResultTys, Ops);
     Data = SDValue(LoadInst, 0);
    Chain = SDValue(LoadInst, 1);
   }
+  // The last may need a writeback on it
+  if (HasWriteback)
+    ResultTys = {DataTy, MVT::i32, MVT::Other};
+  SDValue Ops[] = {Data, N->getOperand(PtrOperand), Chain};
+  auto LoadInst =
+      CurDAG->getMachineNode(OurOpcodes[NumVecs - 1], Loc, ResultTys, Ops);
 
-  for (unsigned i = 0; i < NumVecs; i++)
+  unsigned i;
+  for (i = 0; i < NumVecs; i++)
     ReplaceUses(SDValue(N, i),
-                CurDAG->getTargetExtractSubreg(ARM::qsub_0 + i, Loc, VT, Data));
-  ReplaceUses(SDValue(N, NumVecs), Chain);
+                CurDAG->getTargetExtractSubreg(ARM::qsub_0 + i, Loc, VT,
+                                               SDValue(LoadInst, 0)));
+  if (HasWriteback)
+    ReplaceUses(SDValue(N, i++), SDValue(LoadInst, 1));
+  ReplaceUses(SDValue(N, i), SDValue(LoadInst, HasWriteback ? 2 : 1));
   CurDAG->RemoveDeadNode(N);
 }
 
@@ -2718,6 +2734,7 @@
                                 const uint16_t *DOpcodes,
                                 const uint16_t *QOpcodes0,
                                 const uint16_t *QOpcodes1) {
+  assert(Subtarget->hasNEON());
   assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
   SDLoc dl(N);
 
@@ -3877,14 +3894,24 @@
   }
 
   case ARMISD::VLD2_UPD: {
-    static const uint16_t DOpcodes[] = { ARM::VLD2d8wb_fixed,
-                                         ARM::VLD2d16wb_fixed,
-                                         ARM::VLD2d32wb_fixed,
-                                         ARM::VLD1q64wb_fixed};
-    static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed,
-                                         ARM::VLD2q16PseudoWB_fixed,
-                                         ARM::VLD2q32PseudoWB_fixed };
-    SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+    if (Subtarget->hasNEON()) {
+      static const uint16_t DOpcodes[] = {
+          ARM::VLD2d8wb_fixed, ARM::VLD2d16wb_fixed, ARM::VLD2d32wb_fixed,
+          ARM::VLD1q64wb_fixed};
+      static const uint16_t QOpcodes[] = {ARM::VLD2q8PseudoWB_fixed,
+                                          ARM::VLD2q16PseudoWB_fixed,
+                                          ARM::VLD2q32PseudoWB_fixed};
+      SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+    } else {
+      static const uint16_t Opcodes8[] = {ARM::MVE_VLD20_8,
+                                          ARM::MVE_VLD21_8_wb};
+      static const uint16_t Opcodes16[] = {ARM::MVE_VLD20_16,
+                                           ARM::MVE_VLD21_16_wb};
+      static const uint16_t Opcodes32[] = {ARM::MVE_VLD20_32,
+                                           ARM::MVE_VLD21_32_wb};
+      static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
+      SelectMVE_VLD(N, 2, Opcodes, true);
+    }
     return;
   }
 
@@ -3904,17 +3931,30 @@
   }
 
   case ARMISD::VLD4_UPD: {
-    static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD,
-                                         ARM::VLD4d16Pseudo_UPD,
-                                         ARM::VLD4d32Pseudo_UPD,
-                                         ARM::VLD1d64QPseudoWB_fixed};
-    static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
-                                          ARM::VLD4q16Pseudo_UPD,
-                                          ARM::VLD4q32Pseudo_UPD };
-    static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
-                                          ARM::VLD4q16oddPseudo_UPD,
-                                          ARM::VLD4q32oddPseudo_UPD };
-    SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    if (Subtarget->hasNEON()) {
+      static const uint16_t DOpcodes[] = {
+          ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD, ARM::VLD4d32Pseudo_UPD,
+          ARM::VLD1d64QPseudoWB_fixed};
+      static const uint16_t QOpcodes0[] = {ARM::VLD4q8Pseudo_UPD,
+                                           ARM::VLD4q16Pseudo_UPD,
+                                           ARM::VLD4q32Pseudo_UPD};
+      static const uint16_t QOpcodes1[] = {ARM::VLD4q8oddPseudo_UPD,
+                                           ARM::VLD4q16oddPseudo_UPD,
+                                           ARM::VLD4q32oddPseudo_UPD};
+      SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    } else {
+      static const uint16_t Opcodes8[] = {ARM::MVE_VLD40_8, ARM::MVE_VLD41_8,
+                                          ARM::MVE_VLD42_8,
+                                          ARM::MVE_VLD43_8_wb};
+      static const uint16_t Opcodes16[] = {ARM::MVE_VLD40_16, ARM::MVE_VLD41_16,
+                                           ARM::MVE_VLD42_16,
+                                           ARM::MVE_VLD43_16_wb};
+      static const uint16_t Opcodes32[] = {ARM::MVE_VLD40_32, ARM::MVE_VLD41_32,
+                                           ARM::MVE_VLD42_32,
+                                           ARM::MVE_VLD43_32_wb};
+      static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
+      SelectMVE_VLD(N, 4, Opcodes, true);
+    }
     return;
   }
 
@@ -3962,15 +4002,17 @@
   }
 
   case ARMISD::VST2_UPD: {
-    static const uint16_t DOpcodes[] = { ARM::VST2d8wb_fixed,
-                                         ARM::VST2d16wb_fixed,
-                                         ARM::VST2d32wb_fixed,
-                                         ARM::VST1q64wb_fixed};
-    static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed,
-                                         ARM::VST2q16PseudoWB_fixed,
-                                         ARM::VST2q32PseudoWB_fixed };
-    SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
-    return;
+    if (Subtarget->hasNEON()) {
+      static const uint16_t DOpcodes[] = {
+          ARM::VST2d8wb_fixed, ARM::VST2d16wb_fixed, ARM::VST2d32wb_fixed,
+          ARM::VST1q64wb_fixed};
+      static const uint16_t QOpcodes[] = {ARM::VST2q8PseudoWB_fixed,
+                                          ARM::VST2q16PseudoWB_fixed,
+                                          ARM::VST2q32PseudoWB_fixed};
+      SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
+      return;
+    }
+    break;
   }
 
   case ARMISD::VST3_UPD: {
@@ -3989,18 +4031,20 @@
   }
 
   case ARMISD::VST4_UPD: {
-    static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo_UPD,
-                                         ARM::VST4d16Pseudo_UPD,
-                                         ARM::VST4d32Pseudo_UPD,
-                                         ARM::VST1d64QPseudoWB_fixed};
-    static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
-                                          ARM::VST4q16Pseudo_UPD,
-                                          ARM::VST4q32Pseudo_UPD };
-    static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
-                                          ARM::VST4q16oddPseudo_UPD,
-                                          ARM::VST4q32oddPseudo_UPD };
-    SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
-    return;
+    if (Subtarget->hasNEON()) {
+      static const uint16_t DOpcodes[] = {
+          ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD, ARM::VST4d32Pseudo_UPD,
+          ARM::VST1d64QPseudoWB_fixed};
+      static const uint16_t QOpcodes0[] = {ARM::VST4q8Pseudo_UPD,
+                                           ARM::VST4q16Pseudo_UPD,
+                                           ARM::VST4q32Pseudo_UPD};
+      static const uint16_t QOpcodes1[] = {ARM::VST4q8oddPseudo_UPD,
+                                           ARM::VST4q16oddPseudo_UPD,
+                                           ARM::VST4q32oddPseudo_UPD};
+      SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+      return;
+    }
+    break;
   }
 
   case ARMISD::VST2LN_UPD: {
@@ -4479,7 +4523,7 @@
     static const uint16_t Opcodes32[] = {ARM::MVE_VLD20_32, ARM::MVE_VLD21_32};
     static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
-    SelectMVE_VLD(N, 2, Opcodes);
+    SelectMVE_VLD(N, 2, Opcodes, false);
     return;
   }
 
@@ -4493,7 +4537,7 @@
                                          ARM::MVE_VLD42_32, ARM::MVE_VLD43_32};
     static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
-    SelectMVE_VLD(N, 4, Opcodes);
+    SelectMVE_VLD(N, 4, Opcodes, false);
     return;
   }
 }
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -912,8 +912,6 @@
     setOperationAction(ISD::FMA, MVT::v4f32, Expand);
   }
 
-  setTargetDAGCombine(ISD::INTRINSIC_VOID);
-  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
   setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::SRL);
@@ -942,6 +940,8 @@
     setTargetDAGCombine(ISD::SIGN_EXTEND);
     setTargetDAGCombine(ISD::ZERO_EXTEND);
     setTargetDAGCombine(ISD::ANY_EXTEND);
+    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+    setTargetDAGCombine(ISD::INTRINSIC_VOID);
   }
 
   if (!Subtarget->hasFP64()) {
@@ -13281,6 +13281,128 @@
   return CombineBaseUpdate(N, DCI);
 }
 
+static SDValue PerformMVEVLDCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Addr = N->getOperand(2);
+  MemSDNode *MemN = cast<MemSDNode>(N);
+  SDLoc dl(N);
+
+  // For the stores, where there are multiple intrinsics we only actually want
+  // to post-inc the last of them.
+  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  if (IntNo == Intrinsic::arm_mve_vst2q &&
+      cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1)
+    return SDValue();
+  if (IntNo == Intrinsic::arm_mve_vst4q &&
+      cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3)
+    return SDValue();
+
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+                            UE = Addr.getNode()->use_end();
+       UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD ||
+        UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load/store. Otherwise, folding
+    // it would create a cycle. We can avoid searching through Addr as it's a
+    // predecessor to both.
+    SmallPtrSet<const SDNode *, 32> Visited;
+    SmallVector<const SDNode *, 16> Worklist;
+    Visited.insert(Addr.getNode());
+    Worklist.push_back(N);
+    Worklist.push_back(User);
+    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
+        SDNode::hasPredecessorHelper(User, Visited, Worklist))
+      continue;
+
+    // Find the new opcode for the updating load/store.
+    bool isLoadOp = true;
+    unsigned NewOpc = 0;
+    unsigned NumVecs = 0;
+    switch (IntNo) {
+    default:
+      llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
+    case Intrinsic::arm_mve_vld2q:
+      NewOpc = ARMISD::VLD2_UPD;
+      NumVecs = 2;
+      break;
+    case Intrinsic::arm_mve_vld4q:
+      NewOpc = ARMISD::VLD4_UPD;
+      NumVecs = 4;
+      break;
+    case Intrinsic::arm_mve_vst2q:
+      NewOpc = ARMISD::VST2_UPD;
+      NumVecs = 2;
+      isLoadOp = false;
+      break;
+    case Intrinsic::arm_mve_vst4q:
+      NewOpc = ARMISD::VST4_UPD;
+      NumVecs = 4;
+      isLoadOp = false;
+      break;
+    }
+
+    // Find the size of memory referenced by the load/store.
+    EVT VecTy;
+    if (isLoadOp) {
+      VecTy = N->getValueType(0);
+    } else {
+      VecTy = N->getOperand(3).getValueType();
+    }
+
+    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
+    if (!CInc || CInc->getZExtValue() != NumBytes)
+      continue;
+
+    // Create the new updating load/store node.
+    // First, create an SDVTList for the new updating node's results.
+    EVT Tys[6];
+    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
+    unsigned n;
+    for (n = 0; n < NumResultVecs; ++n)
+      Tys[n] = VecTy;
+    Tys[n++] = MVT::i32;
+    Tys[n] = MVT::Other;
+    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
+
+    // Then, gather the new node's operands.
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(N->getOperand(0)); // incoming chain
+    Ops.push_back(N->getOperand(2)); // ptr
+    Ops.push_back(Inc);
+
+    for (unsigned i = 3; i < N->getNumOperands(); ++i)
+      Ops.push_back(N->getOperand(i));
+
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
+                                           MemN->getMemOperand());
+
+    // Update the uses.
+    SmallVector<SDValue, 5> NewResults;
+    for (unsigned i = 0; i < NumResultVecs; ++i)
+      NewResults.push_back(SDValue(UpdN.getNode(), i));
+
+    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
+    DCI.CombineTo(N, NewResults);
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+    break;
+  }
+
+  return SDValue();
+}
+
 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
 /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
@@ -14744,6 +14866,11 @@
     case Intrinsic::arm_neon_vst3lane:
    case Intrinsic::arm_neon_vst4lane:
      return PerformVLDCombine(N, DCI);
+    case Intrinsic::arm_mve_vld2q:
+    case Intrinsic::arm_mve_vld4q:
+    case Intrinsic::arm_mve_vst2q:
+    case Intrinsic::arm_mve_vst4q:
+      return PerformMVEVLDCombine(N, DCI);
    default: break;
    }
    break;
@@ -16595,6 +16722,34 @@
     Info.flags = MachineMemOperand::MOStore;
     return true;
   }
+  case Intrinsic::arm_mve_vld2q:
+  case Intrinsic::arm_mve_vld4q: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    // Conservatively set memVT to the entire set of vectors loaded.
+    Type *VecTy = cast<StructType>(I.getType())->getTypeAtIndex(1);
+    unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
+    Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = Align(VecTy->getScalarSizeInBits() / 8);
+    // volatile loads with MVE intrinsics not supported
+    Info.flags = MachineMemOperand::MOLoad;
+    return true;
+  }
+  case Intrinsic::arm_mve_vst2q:
+  case Intrinsic::arm_mve_vst4q: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    // Conservatively set memVT to the entire set of vectors stored.
+    Type *VecTy = I.getArgOperand(1)->getType();
+    unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
+    Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = Align(VecTy->getScalarSizeInBits() / 8);
+    // volatile stores with MVE intrinsics not supported
+    Info.flags = MachineMemOperand::MOStore;
+    return true;
+  }
   case Intrinsic::arm_ldaex:
   case Intrinsic::arm_ldrex: {
     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -5053,22 +5053,44 @@
                    "vst" # n.nvecs # stage # "." # s.lanesize>;
 }
 
+def SDTARMVST2 : SDTypeProfile<1, 5, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVec<3>,
+                                      SDTCisSameAs<3, 4>, SDTCisVT<5, i32>]>;
+def SDTARMVST4 : SDTypeProfile<1, 7, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVec<3>,
+                                      SDTCisSameAs<3, 4>, SDTCisSameAs<3, 5>,
+                                      SDTCisSameAs<3, 6>, SDTCisVT<7, i32>]>;
+def MVEVST2UPD : SDNode<"ARMISD::VST2_UPD", SDTARMVST2, [SDNPHasChain]>;
+def MVEVST4UPD : SDNode<"ARMISD::VST4_UPD", SDTARMVST4, [SDNPHasChain]>;
+
 multiclass MVE_vst24_patterns<int lanesize, ValueType VT> {
   foreach stage = [0,1] in
     def : Pat<(int_arm_mve_vst2q i32:$addr,
-                (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)),
+                  (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)),
              (!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize)
-                (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
-                t2_addr_offset_none:$addr)>;
+                  (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
+                  t2_addr_offset_none:$addr)>;
+  foreach stage = [0,1] in
+    def : Pat<(i32 (MVEVST2UPD i32:$addr, (i32 32),
+                      (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage))),
+              (i32 (!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize#_wb)
+                      (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
+                      t2_addr_offset_none:$addr))>;
 
   foreach stage = [0,1,2,3] in
     def : Pat<(int_arm_mve_vst4q i32:$addr,
-                (VT MQPR:$v0), (VT MQPR:$v1),
-                (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)),
+                  (VT MQPR:$v0), (VT MQPR:$v1),
+                  (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)),
              (!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize)
-                (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
-                              VT:$v2, qsub_2, VT:$v3, qsub_3),
-                t2_addr_offset_none:$addr)>;
+                  (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
+                                VT:$v2, qsub_2, VT:$v3, qsub_3),
+                  t2_addr_offset_none:$addr)>;
+  foreach stage = [0,1,2,3] in
+    def : Pat<(i32 (MVEVST4UPD i32:$addr, (i32 64),
+                      (VT MQPR:$v0), (VT MQPR:$v1),
+                      (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage))),
+              (i32 (!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize#_wb)
+                      (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
+                                    VT:$v2, qsub_2, VT:$v3, qsub_3),
+                      t2_addr_offset_none:$addr))>;
 }
 defm : MVE_vst24_patterns<8, v16i8>;
 defm : MVE_vst24_patterns<16, v8i16>;
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll
@@ -25,8 +25,7 @@
 ; CHECK-LABEL: test_vld2q_f16_post:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vld20.16 {q0, q1}, [r0]
-; CHECK-NEXT: vld21.16 {q0, q1}, [r0]
-; CHECK-NEXT: adds r0, #32
+; CHECK-NEXT: vld21.16 {q0, q1}, [r0]!
 ; CHECK-NEXT: vstrw.32 q0, [r1]
 ; CHECK-NEXT: bx lr
 entry:
@@ -66,8 +65,7 @@
 ; CHECK-NEXT: vld40.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vld41.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vld42.8 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: vld43.8 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: adds r0, #64
+; CHECK-NEXT: vld43.8 {q0, q1, q2, q3}, [r0]!
 ; CHECK-NEXT: vstrw.32 q0, [r1]
 ; CHECK-NEXT: bx lr
 entry:
@@ -102,8 +100,7 @@
 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT: vst20.32 {q0, q1}, [r0]
-; CHECK-NEXT: vst21.32 {q0, q1}, [r0]
-; CHECK-NEXT: adds r0, #32
+; CHECK-NEXT: vst21.32 {q0, q1}, [r0]!
 ; CHECK-NEXT: bx lr
 entry:
   %value.coerce.fca.0.0.extract = extractvalue %struct.uint32x4x2_t %value.coerce, 0, 0
@@ -138,8 +135,7 @@
 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT: vst20.16 {q0, q1}, [r0]
-; CHECK-NEXT: vst21.16 {q0, q1}, [r0]
-; CHECK-NEXT: adds r0, #32
+; CHECK-NEXT: vst21.16 {q0, q1}, [r0]!
 ; CHECK-NEXT: bx lr
 entry:
   %value.coerce.fca.0.0.extract = extractvalue %struct.float16x8x2_t %value.coerce, 0, 0
@@ -186,8 +182,7 @@
 ; CHECK-NEXT: vst40.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vst41.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vst42.8 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: vst43.8 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: adds r0, #64
+; CHECK-NEXT: vst43.8 {q0, q1, q2, q3}, [r0]!
 ; CHECK-NEXT: bx lr
 entry:
   %value.coerce.fca.0.0.extract = extractvalue %struct.int8x16x4_t %value.coerce, 0, 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll b/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll
--- a/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll
@@ -6,16 +6,17 @@
 define arm_aapcs_vfpcc void @spill_multivector(<4 x i32>* %p) {
 ; CHECK-LABEL: spill_multivector:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, lr}
-; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: .pad #112
 ; CHECK-NEXT: sub sp, #112
 ; CHECK-NEXT: vld20.32 {q0, q1}, [r0]
+; CHECK-NEXT: mov r5, r0
 ; CHECK-NEXT: add.w lr, sp, #64
 ; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: vld21.32 {q0, q1}, [r0]
+; CHECK-NEXT: vld21.32 {q0, q1}, [r5]!
 ; CHECK-NEXT: adds r0, #64
 ; CHECK-NEXT: vstmia lr, {d0, d1, d2, d3} @ 32-byte Spill
 ; CHECK-NEXT: add.w lr, sp, #32
@@ -44,13 +45,13 @@
 ; CHECK-NEXT: add r0, sp, #64
 ; CHECK-NEXT: vstrw.32 q2, [r4, #48]
 ; CHECK-NEXT: vstrw.32 q6, [r4, #96]
-; CHECK-NEXT: vstrw.32 q1, [r4, #32]
+; CHECK-NEXT: vstrw.32 q1, [r5]
 ; CHECK-NEXT: vldmia r0, {d2, d3, d4, d5} @ 32-byte Reload
 ; CHECK-NEXT: vstrw.32 q2, [r4, #16]
 ; CHECK-NEXT: vstrw.32 q1, [r4]
 ; CHECK-NEXT: add sp, #112
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: pop {r4, r5, r7, pc}
 entry:
   %ip01 = bitcast <4 x i32>* %p to i32*
   %v01 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.v4i32.p0i32(i32* %ip01)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
@@ -7,8 +7,7 @@
 ; CHECK-LABEL: vld2_v4i32:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vld20.32 {q0, q1}, [r0]
-; CHECK-NEXT: vld21.32 {q0, q1}, [r0]
-; CHECK-NEXT: adds r0, #32
+; CHECK-NEXT: vld21.32 {q0, q1}, [r0]!
 ; CHECK-NEXT: vadd.i32 q0, q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -28,8 +27,7 @@
 ; CHECK-LABEL: vld2_v8i16:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vld20.16 {q0, q1}, [r0]
-; CHECK-NEXT: vld21.16 {q0, q1}, [r0]
-; CHECK-NEXT: adds r0, #32
+; CHECK-NEXT: vld21.16 {q0, q1}, [r0]!
 ; CHECK-NEXT: vadd.i16 q0, q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -49,8 +47,7 @@
 ; CHECK-LABEL: vld2_v16i8:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vld20.8 {q0, q1}, [r0]
-; CHECK-NEXT: vld21.8 {q0, q1}, [r0]
-; CHECK-NEXT: adds r0, #32
+; CHECK-NEXT: vld21.8 {q0, q1}, [r0]!
 ; CHECK-NEXT: vadd.i8 q0, q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -114,8 +111,7 @@
 ; CHECK-LABEL: vld2_v4f32:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vld20.32 {q0, q1}, [r0]
-; CHECK-NEXT: vld21.32 {q0, q1}, [r0]
-; CHECK-NEXT: adds r0, #32
+; CHECK-NEXT: vld21.32 {q0, q1}, [r0]!
 ; CHECK-NEXT: vadd.f32 q0, q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -135,8 +131,7 @@
 ; CHECK-LABEL: vld2_v8f16:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vld20.16 {q0, q1}, [r0]
-; CHECK-NEXT: vld21.16 {q0, q1}, [r0]
-; CHECK-NEXT: adds r0, #32
+; CHECK-NEXT: vld21.16 {q0, q1}, [r0]!
 ; CHECK-NEXT: vadd.f16 q0, q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r1]
 ; CHECK-NEXT: bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
@@ -45,14 +45,13 @@
 define void @vld2_v8i32(<16 x i32> *%src, <8 x i32> *%dst) {
 ; CHECK-LABEL: vld2_v8i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: add.w r2, r0, #32
 ; CHECK-NEXT: vld20.32 {q0, q1}, [r0]
-; CHECK-NEXT: vld20.32 {q2, q3}, [r2]
-; CHECK-NEXT: vld21.32 {q0, q1}, [r0]
-; CHECK-NEXT: vld21.32 {q2, q3}, [r2]
+; CHECK-NEXT: vld21.32 {q0, q1}, [r0]!
+; CHECK-NEXT: vld20.32 {q2, q3}, [r0]
 ; CHECK-NEXT: vadd.i32 q0, q0, q1
-; CHECK-NEXT: vadd.i32 q1, q2, q3
+; CHECK-NEXT: vld21.32 {q2, q3}, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: vadd.i32 q1, q2, q3
 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
 ; CHECK-NEXT: bx lr
 entry:
@@ -70,17 +69,16 @@
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: vld20.32 {q0, q1}, [r0]
-; CHECK-NEXT: add.w r12, r0, #96
-; CHECK-NEXT: add.w r3, r0, #32
-; CHECK-NEXT: add.w r2, r0, #64
-; CHECK-NEXT: vld21.32 {q0, q1}, [r0]
+; CHECK-NEXT: add.w r2, r0, #96
+; CHECK-NEXT: add.w r3, r0, #64
+; CHECK-NEXT: vld21.32 {q0, q1}, [r0]!
 ; CHECK-NEXT: vadd.i32 q0, q0, q1
-; CHECK-NEXT: vld20.32 {q1, q2}, [r2]
-; CHECK-NEXT: vld20.32 {q3, q4}, [r12]
-; CHECK-NEXT: vld20.32 {q5, q6}, [r3]
-; CHECK-NEXT: vld21.32 {q5, q6}, [r3]
-; CHECK-NEXT: vld21.32 {q1, q2}, [r2]
-; CHECK-NEXT: vld21.32 {q3, q4}, [r12]
+; CHECK-NEXT: vld20.32 {q1, q2}, [r3]
+; CHECK-NEXT: vld20.32 {q3, q4}, [r2]
+; CHECK-NEXT: vld20.32 {q5, q6}, [r0]
+; CHECK-NEXT: vld21.32 {q5, q6}, [r0]
+; CHECK-NEXT: vld21.32 {q1, q2}, [r3]
+; CHECK-NEXT: vld21.32 {q3, q4}, [r2]
 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2
 ; CHECK-NEXT: vstrw.32 q0, [r1]
 ; CHECK-NEXT: vadd.i32 q5, q5, q6
@@ -162,14 +160,13 @@
 define void @vld2_v16i16(<32 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK-LABEL: vld2_v16i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: add.w r2, r0, #32
 ; CHECK-NEXT: vld20.16 {q0, q1}, [r0]
-; CHECK-NEXT: vld20.16 {q2, q3}, [r2]
-; CHECK-NEXT: vld21.16 {q0, q1}, [r0]
-; CHECK-NEXT: vld21.16 {q2, q3}, [r2]
+; CHECK-NEXT: vld21.16 {q0, q1}, [r0]!
+; CHECK-NEXT: vld20.16 {q2, q3}, [r0]
 ; CHECK-NEXT: vadd.i16 q0, q0, q1
-; CHECK-NEXT: vadd.i16 q1, q2, q3
+; CHECK-NEXT: vld21.16 {q2, q3}, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: vadd.i16 q1, q2, q3
 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
 ; CHECK-NEXT: bx lr
 entry:
@@ -408,14 +405,13 @@
 define void @vld2_v8f32(<16 x float> *%src, <8 x float> *%dst) {
 ; CHECK-LABEL: vld2_v8f32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: add.w r2, r0, #32
 ; CHECK-NEXT: vld20.32 {q0, q1}, [r0]
-; CHECK-NEXT: vld20.32 {q2, q3}, [r2]
-; CHECK-NEXT: vld21.32 {q0, q1}, [r0]
-; CHECK-NEXT: vld21.32 {q2, q3}, [r2]
+; CHECK-NEXT: vld21.32 {q0, q1}, [r0]!
+; CHECK-NEXT: vld20.32 {q2, q3}, [r0]
 ; CHECK-NEXT: vadd.f32 q0, q0, q1
-; CHECK-NEXT: vadd.f32 q1, q2, q3
+; CHECK-NEXT: vld21.32 {q2, q3}, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: vadd.f32 q1, q2, q3
 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
 ; CHECK-NEXT: bx lr
 entry:
@@ -433,17 +429,16 @@
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: vld20.32 {q0, q1}, [r0]
-; CHECK-NEXT: add.w r12, r0, #96
-; CHECK-NEXT: add.w r3, r0, #32
-; CHECK-NEXT: add.w r2, r0, #64
-; CHECK-NEXT: vld21.32 {q0, q1}, [r0]
+; CHECK-NEXT: add.w r2, r0, #96
+; CHECK-NEXT: add.w r3, r0, #64
+; CHECK-NEXT: vld21.32 {q0, q1}, [r0]!
 ; CHECK-NEXT: vadd.f32 q0, q0, q1
-; CHECK-NEXT: vld20.32 {q1, q2}, [r2]
-; CHECK-NEXT: vld20.32 {q3, q4}, [r12]
-; CHECK-NEXT: vld20.32 {q5, q6}, [r3]
-; CHECK-NEXT: vld21.32 {q5, q6}, [r3]
-; CHECK-NEXT: vld21.32 {q1, q2}, [r2]
-; CHECK-NEXT: vld21.32 {q3, q4}, [r12]
+; CHECK-NEXT: vld20.32 {q1, q2}, [r3]
+; CHECK-NEXT: vld20.32 {q3, q4}, [r2]
+; CHECK-NEXT: vld20.32 {q5, q6}, [r0]
+; CHECK-NEXT: vld21.32 {q5, q6}, [r0]
+; CHECK-NEXT: vld21.32 {q1, q2}, [r3]
+; CHECK-NEXT: vld21.32 {q3, q4}, [r2]
 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2
 ; CHECK-NEXT: vstrw.32 q0, [r1]
 ; CHECK-NEXT: vadd.f32 q5, q5, q6
@@ -552,15 +547,14 @@
 define void @vld2_v16f16(<32 x half> *%src, <16 x half> *%dst) {
 ; CHECK-LABEL: vld2_v16f16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: add.w r2, r0, #32
-; CHECK-NEXT: vld20.16 {q0, q1}, [r2]
-; CHECK-NEXT: vld21.16 {q0, q1}, [r2]
+; CHECK-NEXT: vld20.16 {q0, q1}, [r0]
+; CHECK-NEXT: vld21.16 {q0, q1}, [r0]!
+; CHECK-NEXT: vld20.16 {q2, q3}, [r0]
 ; CHECK-NEXT: vadd.f16 q0, q0, q1
-; CHECK-NEXT: vld20.16 {q1, q2}, [r0]
-; CHECK-NEXT: vld21.16 {q1, q2}, [r0]
-; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: vadd.f16 q0, q1, q2
+; CHECK-NEXT: vld21.16 {q2, q3}, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: vadd.f16 q2, q2, q3
+; CHECK-NEXT: vstrw.32 q2, [r1, #16]
 ; CHECK-NEXT: bx lr
 entry:
   %l1 = load <32 x half>, <32 x half>* %src, align 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
@@ -11,8 +11,7 @@
 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: adds r0, #64
+; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]!
 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
 ; CHECK-NEXT: vadd.i32 q4, q2, q3
 ; CHECK-NEXT: vadd.i32 q0, q0, q1
@@ -44,8 +43,7 @@
 ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: adds r0, #64
+; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]!
 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
 ; CHECK-NEXT: vadd.i16 q4, q2, q3
 ; CHECK-NEXT: vadd.i16 q0, q0, q1
@@ -77,8 +75,7 @@
 ; CHECK-NEXT: vld40.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vld41.8 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vld42.8 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: vld43.8 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: adds r0, #64
+; CHECK-NEXT: vld43.8 {q0, q1, q2, q3}, [r0]!
 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
 ; CHECK-NEXT: vadd.i8 q4, q2, q3
 ; CHECK-NEXT: vadd.i8 q0, q0, q1
@@ -188,8 +185,7 @@
 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: adds r0, #64
+; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]!
 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
 ; CHECK-NEXT: vadd.f32 q4, q2, q3
 ; CHECK-NEXT: vadd.f32 q0, q0, q1
@@ -221,8 +217,7 @@
 ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: adds r0, #64
+; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]!
 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
 ; CHECK-NEXT: vadd.f16 q4, q2, q3
 ; CHECK-NEXT: vadd.f16 q0, q0, q1
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -78,38 +78,27 @@
 define void @vld4_v8i32(<32 x i32> *%src, <8 x i32> *%dst) {
 ; CHECK-LABEL: vld4_v8i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5}
-; CHECK-NEXT: push {r4, r5}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #88
-; CHECK-NEXT: sub sp, #88
-; CHECK-NEXT: add.w r2, r0, #64
-; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vstrw.32 q5, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT: vmov q1, q4
-; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q4, q6, q7
-; CHECK-NEXT: vadd.i32 q5, q1, q0
-; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
-; CHECK-NEXT: vadd.i32 q4, q5, q4
-; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vstrw.32 q4, [r1]
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]!
 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT: vadd.i32 q5, q2, q3
+; CHECK-NEXT: vadd.i32 q6, q2, q3
 ; CHECK-NEXT: vadd.i32 q0, q0, q1
-; CHECK-NEXT: vadd.i32 q0, q0, q5
-; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: add sp, #88
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop {r4, r5}
+; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT: vadd.i32 q0, q0, q6
+; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
+; CHECK-NEXT: vadd.i32 q5, q3, q4
+; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vadd.i32 q1, q1, q5
+; CHECK-NEXT: vstrw.32 q1, [r1, #16]
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: bx lr
 entry:
   %l1 = load <32 x i32>, <32 x i32>* %src, align 4
@@ -129,77 +118,66 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, r5}
 ; CHECK-NEXT: push {r4, r5}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #152
-; CHECK-NEXT: sub sp, #152
-; CHECK-NEXT: add.w r2, r0, #128
-; CHECK-NEXT: add r3, sp, #64
-; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: add r4, sp, #64
-; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vstmia r3, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT: add.w r3, r0, #64
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .pad #136
+; CHECK-NEXT: sub sp, #136
 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: add.w r3, r0, #192
 ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: adds r0, #192
-; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2]!
+; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
 ; CHECK-NEXT: vadd.i32 q4, q2, q3
-; CHECK-NEXT: vmov q5, q0
-; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q4, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vadd.i32 q4, q5, q0
-; CHECK-NEXT: vldmia r4, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
-; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: add r4, sp, #64
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vstrw.32 q4, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r3]
+; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r3]
+; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r3]
+; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r3]
+; CHECK-NEXT: vldrw.u32 q6, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q5, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vadd.i32 q6, q5, q6
+; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q0, q0, q2
+; CHECK-NEXT: vstrw.32 q6, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT: vadd.i32 q1, q3, q5
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2]
+; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2]
 ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vstmia r4, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r3]
-; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r3]
-; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r3]
-; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r3]
-; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: add r3, sp, #64
-; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vadd.i32 q4, q4, q5
-; CHECK-NEXT: vadd.i32 q0, q1, q0
-; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q0, q0, q7
-; CHECK-NEXT: vadd.i32 q0, q4, q0
-; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vldmia r3, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
 ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: add r2, sp, #64
-; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: add r0, sp, #64
-; CHECK-NEXT: @ kill: def $q4 killed $q4 killed $q4_q5_q6_q7
-; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vmov q2, q5
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q4, q4, q2
-; CHECK-NEXT: vadd.i32 q5, q6, q0
-; CHECK-NEXT: vldmia r0, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
+; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
+; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT: vadd.i32 q4, q4, q5
-; CHECK-NEXT: vadd.i32 q5, q2, q3
-; CHECK-NEXT: vadd.i32 q0, q0, q1
-; CHECK-NEXT: vstrw.32 q4, [r1, #48]
+; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT: vmov q5, q1
+; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT: vadd.i32 q0, q0, q5
+; CHECK-NEXT: vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload
+; CHECK-NEXT: vadd.i32 q1, q2, q1
+; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6
+; CHECK-NEXT: vadd.i32 q2, q3, q4
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vadd.i32 q1, q5, q6
+; CHECK-NEXT: vadd.i32 q1, q2, q1
+; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT: vstrw.32 q0, [r1, #32]
-; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q2, [r1, #48]
+; CHECK-NEXT: vstrw.32 q1, [r1, #16]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: add sp, #152
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: add sp, #136
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: pop {r4, r5}
 ; CHECK-NEXT: bx lr
 entry:
@@ -341,38 +319,27 @@
 define void @vld4_v16i16(<64 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK-LABEL: vld4_v16i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5}
-; CHECK-NEXT: push {r4, r5}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #88
-; CHECK-NEXT: sub sp, #88
-; CHECK-NEXT: add.w r2, r0, #64
-; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT: vld40.16 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vld41.16 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vld42.16 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vld43.16 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vstrw.32 q5, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT: vmov q1, q4
-; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT: vadd.i16 q4, q6, q7
-; CHECK-NEXT: vadd.i16 q5, q1, q0
-; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
-; CHECK-NEXT: vadd.i16 q4, q5, q4
-; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vstrw.32 q4, [r1]
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]!
 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT: vadd.i16 q5, q2, q3
+; CHECK-NEXT: vadd.i16 q6, q2, q3
 ; CHECK-NEXT: vadd.i16 q0, q0, q1
-; CHECK-NEXT: vadd.i16 q0, q0, q5
-; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: add sp, #88
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop {r4, r5}
+; CHECK-NEXT: vld40.16 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT: vadd.i16 q0, q0, q6
+; CHECK-NEXT: vld41.16 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT: vld42.16 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT: vld43.16 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
+; CHECK-NEXT: vadd.i16 q5, q3, q4
+; CHECK-NEXT: vadd.i16 q1, q1, q2
+; CHECK-NEXT: vadd.i16 q1, q1, q5
+; CHECK-NEXT: vstrw.32 q1, [r1, #16]
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: bx lr
 entry:
   %l1 = load <64 x i16>, <64 x i16>* %src, align 4
@@ -844,38 +811,27 @@
 define void @vld4_v8f32(<32 x float> *%src, <8 x float> *%dst) {
 ; CHECK-LABEL: vld4_v8f32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5}
-; CHECK-NEXT: push {r4, r5}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #88
-; CHECK-NEXT: sub sp, #88
-; CHECK-NEXT: add.w r2, r0, #64
-; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vstrw.32 q5, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT: vmov q1, q4
-; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT: vadd.f32 q4, q6, q7
-; CHECK-NEXT: vadd.f32 q5, q1, q0
-; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
-; CHECK-NEXT: vadd.f32 q4, q5, q4
-; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vstrw.32 q4, [r1]
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]!
 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT: vadd.f32 q5, q2, q3
+; CHECK-NEXT: vadd.f32 q6, q2, q3
 ; CHECK-NEXT: vadd.f32 q0, q0, q1
-; CHECK-NEXT: vadd.f32 q0, q0, q5
-; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: add sp, #88
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop {r4, r5}
+; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT: vadd.f32 q0, q0, q6
+; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
+; CHECK-NEXT: vadd.f32 q5, q3, q4
+; CHECK-NEXT: vadd.f32 q1, q1, q2
+; CHECK-NEXT: vadd.f32 q1, q1, q5
+; CHECK-NEXT: vstrw.32 q1, [r1, #16]
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: bx lr
 entry:
   %l1 = load <32 x float>, <32 x float>* %src, align 4
@@ -895,77 +851,66 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, r5}
 ; CHECK-NEXT: push {r4, r5}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #152
-; CHECK-NEXT: sub sp, #152
-; CHECK-NEXT: add.w r2, r0, #128
-; CHECK-NEXT: add r3, sp, #64
-; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: add r4, sp, #64
-; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vstmia r3, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT: add.w r3, r0, #64
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .pad #136
+; CHECK-NEXT: sub sp, #136
 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: add.w r3, r0, #192
 ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT: adds r0, #192
-; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2]!
+; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
 ; CHECK-NEXT: vadd.f32 q4, q2, q3
-; CHECK-NEXT: vmov q5, q0
-; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q4, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vadd.f32 q4, q5, q0
-; CHECK-NEXT: vldmia r4, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
-; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: add r4, sp, #64
+; CHECK-NEXT: vadd.f32 q0, q0, q1
+; CHECK-NEXT: vstrw.32 q4, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r3]
+; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r3]
+; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r3]
+; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r3]
+; CHECK-NEXT: vldrw.u32 q6, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q5, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vadd.f32 q6, q5, q6
+; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT: vadd.f32 q0, q0, q2
+; CHECK-NEXT: vstrw.32 q6, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT: vadd.f32 q1, q3, q5
+; CHECK-NEXT: vadd.f32 q0, q0, q1
+; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2]
+; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2]
 ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vstmia r4, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r3]
-; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r3]
-; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r3]
-; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r3]
-; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: add r3, sp, #64
-; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vadd.f32 q4, q4, q5
-; CHECK-NEXT: vadd.f32 q0, q1, q0
-; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vadd.f32 q0, q0, q7
-; CHECK-NEXT: vadd.f32 q0, q4, q0
-; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vldmia r3, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
 ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: add r2, sp, #64
-; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r0]
-; CHECK-NEXT: add r0, sp, #64
-; CHECK-NEXT: @ kill: def $q4 killed $q4 killed $q4_q5_q6_q7
-; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vmov q2, q5
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vadd.f32 q4, q4, q2
-; CHECK-NEXT: vadd.f32 q5, q6, q0
-; CHECK-NEXT: vldmia r0, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
+; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
+; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT: vadd.f32 q4, q4, q5
-; CHECK-NEXT: vadd.f32 q5, q2, q3
-; CHECK-NEXT: vadd.f32 q0, q0, q1
-; CHECK-NEXT: vstrw.32 q4, [r1, #48]
+; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT: vmov q5, q1
+; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT: vadd.f32 q0, q0, q5
+; CHECK-NEXT: vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload
+; CHECK-NEXT: vadd.f32 q1, q2, q1
+; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6
+; CHECK-NEXT: vadd.f32 q2, q3, q4
+; CHECK-NEXT: vadd.f32 q0, q0, q1
+; CHECK-NEXT: vadd.f32 q1, q5, q6
+; CHECK-NEXT: vadd.f32 q1, q2, q1
+; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT: vstrw.32 q0, [r1, #32]
-; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q2, [r1, #48]
+; CHECK-NEXT: vstrw.32 q1, [r1, #16]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: add sp, #152
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: add sp, #136
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: pop {r4, r5}
 ; CHECK-NEXT: bx lr
 entry:
@@ -1126,28 +1071,37 @@
 define void @vld4_v16f16(<64 x half> *%src, <16 x half> *%dst) {
 ; CHECK-LABEL: vld4_v16f16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: add.w r2, r0, #64
-; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r2]
-; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r2]
+; CHECK-NEXT: .save {r4, r5}
+; CHECK-NEXT: push {r4, r5}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #88
+; CHECK-NEXT: sub sp, #88
+; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]!
+; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
+; CHECK-NEXT: vld40.16 {q4, q5, q6, q7}, [r0]
+; CHECK-NEXT: vld41.16 {q4, q5, q6, q7}, [r0]
+; CHECK-NEXT: vld42.16 {q4, q5, q6, q7}, [r0]
+; CHECK-NEXT: vld43.16 {q4, q5, q6, q7}, [r0]
+; CHECK-NEXT: @ kill: def $q4 killed $q4 killed $q4_q5_q6_q7
+; CHECK-NEXT: vadd.f16 q0, q6, q7
+; CHECK-NEXT: vadd.f16 q4, q4, q5
+; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vadd.f16 q4, q4, q0
+; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
+; CHECK-NEXT: vstrw.32 q4, [r1, #16]
 ; CHECK-NEXT: vadd.f16 q4, q2, q3
 ; CHECK-NEXT: vadd.f16 q0, q0, q1
 ; CHECK-NEXT: vadd.f16 q0, q0, q4
-; CHECK-NEXT: vld40.16 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT: vld41.16 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT: vld42.16 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT: vld43.16 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
-; CHECK-NEXT: vadd.f16 q0, q3, q4
-; CHECK-NEXT: vadd.f16 q1, q1, q2
-; CHECK-NEXT: vadd.f16 q0, q1, q0
 ; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: add sp, #88
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: pop {r4, r5}
 ; CHECK-NEXT: bx lr
 entry:
   %l1 = load <64 x half>, <64 x half>* %src, align 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll
@@ -8,9 +8,9 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: add.w r0, r1, #32
 ; CHECK-NEXT: vst20.32 {q0, q1}, [r1]
-; CHECK-NEXT: vst21.32 {q0, q1}, [r1]
+; CHECK-NEXT: vst21.32 {q0, q1}, [r1]!
+; CHECK-NEXT: mov r0, r1
 ; CHECK-NEXT: bx lr
 entry:
   %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
@@ -30,9 +30,9 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: add.w r0, r1, #32
 ; CHECK-NEXT: vst20.16 {q0, q1}, [r1]
-; CHECK-NEXT: vst21.16 {q0, q1}, [r1]
+; CHECK-NEXT: vst21.16 {q0, q1}, [r1]!
+; CHECK-NEXT: mov r0, r1
 ; CHECK-NEXT: bx lr
 entry:
   %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
@@ -52,9 +52,9 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: add.w r0, r1, #32
 ; CHECK-NEXT: vst20.8 {q0, q1}, [r1]
-; CHECK-NEXT: vst21.8 {q0, q1}, [r1]
+; CHECK-NEXT: vst21.8 {q0, q1}, [r1]!
+; CHECK-NEXT: mov r0, r1
 ; CHECK-NEXT: bx lr
 entry:
   %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0
@@ -102,9 +102,9 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: add.w r0, r1, #32
 ; CHECK-NEXT: vst20.32 {q0, q1}, [r1]
-; CHECK-NEXT: vst21.32 {q0, q1}, [r1]
+; CHECK-NEXT: vst21.32 {q0, q1}, [r1]!
+; CHECK-NEXT: mov r0, r1
 ; CHECK-NEXT: bx lr
 entry:
   %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
@@ -124,9 +124,9 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: add.w r0, r1, #32
 ; CHECK-NEXT: vst20.16 {q0, q1}, [r1]
-; CHECK-NEXT: vst21.16 {q0, q1}, [r1]
+; CHECK-NEXT: vst21.16 {q0, q1}, [r1]!
+; CHECK-NEXT: mov r0, r1
 ; CHECK-NEXT: bx lr
 entry:
   %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
@@ -58,11 +58,10 @@
 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
 ; CHECK-NEXT: vldrw.u32 q0, [r0]
 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: add.w r0, r1, #32
 ; CHECK-NEXT: vst20.32 {q0, q1}, [r1]
-; CHECK-NEXT: vst21.32 {q0, q1}, [r1]
-; CHECK-NEXT: vst20.32 {q2, q3}, [r0]
-; CHECK-NEXT: vst21.32 {q2, q3}, [r0]
+; CHECK-NEXT: vst21.32 {q0, q1}, [r1]!
+; CHECK-NEXT: vst20.32 {q2, q3}, [r1]
+; CHECK-NEXT: vst21.32 {q2, q3}, [r1]
 ; CHECK-NEXT: bx lr
 entry:
   %s1 = getelementptr <8 x i32>, <8 x i32>* %src, i32 0
@@ -79,21 +78,20 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vldrw.u32 q7, [r0, #64]
 ; CHECK-NEXT: vldrw.u32 q1, [r0, #112]
 ; CHECK-NEXT: vldrw.u32 q3, [r0, #96]
 ; CHECK-NEXT: vldrw.u32 q5, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q7, [r0, #64]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
 ; CHECK-NEXT: vldrw.u32 q6, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
 ; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT: vst20.32 {q6, q7}, [r1]
 ; CHECK-NEXT: add.w r0, r1, #96
 ; CHECK-NEXT: add.w r2, r1, #64
-; CHECK-NEXT: add.w r3, r1, #32
-; CHECK-NEXT: vst20.32 {q6, q7}, [r1]
-; CHECK-NEXT: vst21.32 {q6, q7}, [r1]
-; CHECK-NEXT: vst20.32 {q4, q5}, [r3]
-; CHECK-NEXT: vst21.32 {q4, q5}, [r3]
+; CHECK-NEXT: vst21.32 {q6, q7}, [r1]!
+; CHECK-NEXT: vst20.32 {q4, q5}, [r1]
+; CHECK-NEXT: vst21.32 {q4, q5}, [r1]
 ; CHECK-NEXT: vst20.32 {q2, q3}, [r2]
 ; CHECK-NEXT: vst21.32 {q2, q3}, [r2]
 ; CHECK-NEXT: vst20.32 {q0, q1}, [r0]
@@ -178,11 +176,10 @@
 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
 ; CHECK-NEXT: vldrw.u32 q0, [r0]
 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: add.w r0, r1, #32
 ; CHECK-NEXT: vst20.16 {q0, q1}, [r1]
-; CHECK-NEXT: vst21.16 {q0, q1}, [r1]
-; CHECK-NEXT: vst20.16 {q2, q3}, [r0]
-; CHECK-NEXT: vst21.16 {q2, q3}, [r0]
+; CHECK-NEXT: vst21.16 {q0, q1}, [r1]!
+; CHECK-NEXT: vst20.16 {q2, q3}, [r1]
+; CHECK-NEXT: vst21.16 {q2, q3}, [r1]
 ; CHECK-NEXT: bx lr
 entry:
   %s1 = getelementptr <16 x i16>, <16 x i16>* %src, i32 0
@@ -385,11 +382,10 @@
 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
 ; CHECK-NEXT: vldrw.u32 q0, [r0]
 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: add.w r0, r1, #32
 ; CHECK-NEXT: vst20.32 {q0, q1}, [r1]
-; CHECK-NEXT: vst21.32 {q0, q1}, [r1]
-; CHECK-NEXT: vst20.32 {q2, q3}, [r0]
-; CHECK-NEXT: vst21.32 {q2, q3}, [r0]
+; CHECK-NEXT: vst21.32 {q0, q1}, [r1]!
+; CHECK-NEXT: vst20.32 {q2, q3}, [r1]
+; CHECK-NEXT: vst21.32 {q2, q3}, [r1]
 ; CHECK-NEXT: bx lr
 entry:
   %s1 = getelementptr <8 x float>, <8 x float>* %src, i32 0
@@ -406,21 +402,20 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vldrw.u32 q7, [r0, #64]
 ; CHECK-NEXT: vldrw.u32 q1, [r0, #112]
 ; CHECK-NEXT: vldrw.u32 q3, [r0, #96]
 ; CHECK-NEXT: vldrw.u32 q5, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q7, [r0, #64]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
 ; CHECK-NEXT: vldrw.u32 q6, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
 ; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT: vst20.32 {q6, q7}, [r1]
 ; CHECK-NEXT: add.w r0, r1, #96
 ; CHECK-NEXT: add.w r2, r1, #64
-; CHECK-NEXT: add.w r3, r1, #32
-; CHECK-NEXT: vst20.32 {q6, q7}, [r1]
-; CHECK-NEXT: vst21.32 {q6, q7}, [r1]
-; CHECK-NEXT: vst20.32 {q4, q5}, [r3]
-; CHECK-NEXT: vst21.32 {q4, q5}, [r3]
+; CHECK-NEXT: vst21.32 {q6, q7}, [r1]!
+; CHECK-NEXT: vst20.32 {q4, q5}, [r1]
+; CHECK-NEXT: vst21.32 {q4, q5}, [r1]
 ; CHECK-NEXT: vst20.32 {q2, q3}, [r2]
 ; CHECK-NEXT: vst21.32 {q2, q3}, [r2]
 ; CHECK-NEXT: vst20.32 {q0, q1}, [r0]
@@ -531,15 +526,14 @@
 define void @vst2_v16f16(<16 x half> *%src, <32 x half> *%dst) {
 ; CHECK-LABEL: vst2_v16f16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
 ; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: add.w r0, r1, #32
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT: vst20.16 {q2, q3}, [r1]
-; CHECK-NEXT: vst21.16 {q2, q3}, [r1]
-; CHECK-NEXT: vst20.16 {q0, q1}, [r0]
-; CHECK-NEXT: vst21.16 {q0, q1}, [r0]
+; CHECK-NEXT: vst21.16 {q2, q3}, [r1]!
+; CHECK-NEXT: vst20.16 {q0, q1}, [r1]
+; CHECK-NEXT: vst21.16 {q0, q1}, [r1]
 ; CHECK-NEXT: bx lr
 entry:
   %s1 = getelementptr <16 x half>, <16 x half>* %src, i32 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll
@@ -10,11 +10,11 @@
 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: add.w r0, r1, #64
 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1]
 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1]
 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1]
-; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1]!
+; CHECK-NEXT: mov r0, r1
 ; CHECK-NEXT: bx lr
 entry:
   %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
@@ -42,11 +42,11 @@
 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: add.w r0, r1, #64
 ; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1]
 ; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1]
 ; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1]
-; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1]
+; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1]!
+; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0 @@ -74,11 +74,11 @@ ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: add.w r0, r1, #64 ; CHECK-NEXT: vst40.8 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst41.8 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst42.8 {q0, q1, q2, q3}, [r1] -; CHECK-NEXT: vst43.8 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst43.8 {q0, q1, q2, q3}, [r1]! +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0 @@ -153,11 +153,11 @@ ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: add.w r0, r1, #64 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1] -; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1]! +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0 @@ -185,11 +185,11 @@ ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: add.w r0, r1, #64 ; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1] -; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1]! +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -80,23 +80,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q6, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vmov q7, q6 -; CHECK-NEXT: add.w r0, r1, #64 ; CHECK-NEXT: vmov q3, q2 ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1] -; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1] -; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1]! 
+; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -122,53 +121,59 @@ ; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #152 -; CHECK-NEXT: sub sp, #152 -; CHECK-NEXT: vldrw.u32 q2, [r0, #176] +; CHECK-NEXT: .pad #216 +; CHECK-NEXT: sub sp, #216 +; CHECK-NEXT: vldrw.u32 q2, [r0, #144] ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-NEXT: vldrw.u32 q6, [r0, #128] +; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: vldrw.u32 q6, [r0, #176] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vldrw.u32 q2, [r0, #128] +; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q5, [r0, #64] -; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #112] -; CHECK-NEXT: vmov q7, q6 -; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [r0, #64] +; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q1, [r0, #96] -; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vldrw.u32 q6, [r0, #160] +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vstmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vldrw.u32 q5, [r0, #112] +; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vldmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [r0, #96] +; CHECK-NEXT: vstmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]! 
; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #144] -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: add.w r0, r1, #64 -; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] -; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] -; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1] -; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vmov q7, q6 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: add.w r0, r1, #192 -; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: adds r1, #128 ; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload -; CHECK-NEXT: vmov q7, q6 ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1] @@ -177,7 +182,7 @@ ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: add sp, #152 +; CHECK-NEXT: add sp, #216 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr @@ -328,23 +333,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q6, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vmov q7, q6 -; CHECK-NEXT: add.w r0, r1, #64 ; CHECK-NEXT: vmov q3, q2 ; CHECK-NEXT: vst40.16 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst41.16 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst42.16 {q4, q5, q6, q7}, [r1] -; CHECK-NEXT: vst43.16 {q4, q5, q6, q7}, [r1] -; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst43.16 {q4, q5, q6, q7}, [r1]! 
+; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -736,23 +740,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q6, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vmov q7, q6 -; CHECK-NEXT: add.w r0, r1, #64 ; CHECK-NEXT: vmov q3, q2 ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1] -; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1] -; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1]! +; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -778,53 +781,59 @@ ; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #152 -; CHECK-NEXT: sub sp, #152 -; CHECK-NEXT: vldrw.u32 q2, [r0, #176] +; CHECK-NEXT: .pad #216 +; CHECK-NEXT: sub sp, #216 +; CHECK-NEXT: vldrw.u32 q2, [r0, #144] ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-NEXT: vldrw.u32 q6, [r0, #128] +; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: vldrw.u32 q6, [r0, #176] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vldrw.u32 q2, [r0, #128] +; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q5, [r0, #64] -; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #112] -; CHECK-NEXT: vmov q7, q6 -; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [r0, #64] +; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q1, [r0, #96] -; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vldrw.u32 q6, [r0, #160] +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vstmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vldrw.u32 q5, [r0, #112] +; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, 
d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vldmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [r0, #96] +; CHECK-NEXT: vstmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldmia sp, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]! ; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #144] -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: add.w r0, r1, #64 -; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] -; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] -; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1] -; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1] +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vmov q7, q6 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: add.w r0, r1, #192 -; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: adds r1, #128 ; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload -; CHECK-NEXT: vmov q7, q6 ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1] @@ -833,7 +842,7 @@ ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: add sp, #152 +; CHECK-NEXT: add sp, #216 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr @@ -1006,23 +1015,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q6, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vmov q7, q6 ; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: add.w r0, r1, #64 ; CHECK-NEXT: vst40.16 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst41.16 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst42.16 {q4, q5, q6, q7}, [r1] -; CHECK-NEXT: vst43.16 {q4, q5, q6, q7}, [r1] -; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r0] 
-; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vst43.16 {q4, q5, q6, q7}, [r1]! +; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1] +; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: